From 82ae857877fdf3492f40bca87657a07892c3f59b Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 6 Dec 2019 06:29:24 +0000
Subject: Enable build of test/syscall tests on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I277d6c708bbf5c3edd7c3568941cfd01dc122e17
---
 test/syscalls/linux/BUILD       | 55 ++++++++++++++++++++++++++++++++++-------
 test/syscalls/linux/bad.cc      |  3 ++-
 test/syscalls/linux/chroot.cc   |  2 +-
 test/syscalls/linux/fork.cc     |  3 +++
 test/syscalls/linux/getdents.cc | 10 +++++++-
 test/syscalls/linux/preadv2.cc  |  2 ++
 test/syscalls/linux/proc.cc     |  2 +-
 test/syscalls/linux/pwritev2.cc |  2 ++
 test/syscalls/linux/seccomp.cc  |  5 ++++
 test/syscalls/linux/stat.cc     |  2 ++
 10 files changed, 73 insertions(+), 13 deletions(-)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 064ce8429..68dcc598b 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -19,6 +19,16 @@ exports_files(
     visibility = ["//:sandbox"],
 )
 
+config_setting(
+    name = "x86_64",
+    constraint_values = ["@bazel_tools//platforms:x86_64"],
+)
+
+config_setting(
+    name = "aarch64",
+    constraint_values = ["@bazel_tools//platforms:aarch64"],
+)
+
 cc_binary(
     name = "sigaltstack_check",
     testonly = 1,
@@ -197,7 +207,10 @@ cc_binary(
 cc_binary(
     name = "32bit_test",
     testonly = 1,
-    srcs = ["32bit.cc"],
+    srcs = select({
+	":x86_64": ["32bit.cc"],
+	":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:memory_util",
@@ -584,7 +597,10 @@ cc_binary(
 cc_binary(
     name = "exceptions_test",
     testonly = 1,
-    srcs = ["exceptions.cc"],
+    srcs = select({
+        ":x86_64": ["exceptions.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:logging",
@@ -640,7 +656,10 @@ cc_binary(
 cc_binary(
     name = "exec_binary_test",
     testonly = 1,
-    srcs = ["exec_binary.cc"],
+    srcs = select({
+        ":x86_64": ["exec_binary.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
@@ -811,7 +830,10 @@ cc_binary(
 cc_binary(
     name = "fpsig_fork_test",
     testonly = 1,
-    srcs = ["fpsig_fork.cc"],
+    srcs = select({
+        ":x86_64": ["fpsig_fork.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:logging",
@@ -825,7 +847,10 @@ cc_binary(
 cc_binary(
     name = "fpsig_nested_test",
     testonly = 1,
-    srcs = ["fpsig_nested.cc"],
+    srcs = select({
+        ":x86_64": ["fpsig_nested.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:test_main",
@@ -1440,7 +1465,10 @@ cc_binary(
 cc_binary(
     name = "arch_prctl_test",
     testonly = 1,
-    srcs = ["arch_prctl.cc"],
+    srcs = select({
+        ":x86_64": ["arch_prctl.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:test_main",
@@ -2035,7 +2063,10 @@ cc_binary(
 cc_binary(
     name = "sigiret_test",
     testonly = 1,
-    srcs = ["sigiret.cc"],
+    srcs = select({
+        ":x86_64": ["sigiret.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:logging",
@@ -2043,7 +2074,10 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:timer_util",
         "@com_google_googletest//:gtest",
-    ],
+    ] + select({
+        ":x86_64": [],
+        ":aarch64": ["//test/util:test_main"],
+	}),
 )
 
 cc_binary(
@@ -3260,7 +3294,10 @@ cc_binary(
 cc_binary(
     name = "sysret_test",
     testonly = 1,
-    srcs = ["sysret.cc"],
+    srcs = select({
+        ":x86_64": ["sysret.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:logging",
diff --git a/test/syscalls/linux/bad.cc b/test/syscalls/linux/bad.cc
index f246a799e..9e4d8ea57 100644
--- a/test/syscalls/linux/bad.cc
+++ b/test/syscalls/linux/bad.cc
@@ -22,12 +22,13 @@ namespace gvisor {
 namespace testing {
 
 namespace {
-
+#if defined(__x86_64__)
 TEST(BadSyscallTest, NotImplemented) {
   // get_kernel_syms is not supported in Linux > 2.6, and not implemented in
   // gVisor.
   EXPECT_THAT(syscall(SYS_get_kernel_syms), SyscallFailsWithErrno(ENOSYS));
 }
+#endif // defined(__x86_64__)
 
 TEST(BadSyscallTest, NegativeOne) {
   EXPECT_THAT(syscall(-1), SyscallFailsWithErrno(ENOSYS));
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index 04bc2d7b9..0a2d44a2c 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -162,7 +162,7 @@ TEST(ChrootTest, DotDotFromOpenFD) {
 
   // getdents on fd should not error.
   char buf[1024];
-  ASSERT_THAT(syscall(SYS_getdents, fd.get(), buf, sizeof(buf)),
+  ASSERT_THAT(syscall(SYS_getdents64, fd.get(), buf, sizeof(buf)),
               SyscallSucceeds());
 }
 
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index 371890110..906f3358d 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -215,6 +215,8 @@ TEST_F(ForkTest, PrivateMapping) {
   EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
 }
 
+// CPUID is x86 specific.
+#ifdef __x86_64__
 // Test that cpuid works after a fork.
 TEST_F(ForkTest, Cpuid) {
   pid_t child = Fork();
@@ -227,6 +229,7 @@ TEST_F(ForkTest, Cpuid) {
   }
   EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
 }
+#endif
 
 TEST_F(ForkTest, Mmap) {
   pid_t child = Fork();
diff --git a/test/syscalls/linux/getdents.cc b/test/syscalls/linux/getdents.cc
index ad2dbacb8..bfd18d4ff 100644
--- a/test/syscalls/linux/getdents.cc
+++ b/test/syscalls/linux/getdents.cc
@@ -228,19 +228,27 @@ class GetdentsTest : public ::testing::Test {
 
 // Multiple template parameters are not allowed, so we must use explicit
 // template specialization to set the syscall number.
+#ifdef __x86_64__
 template <>
 int GetdentsTest<struct linux_dirent>::SyscallNum() {
   return SYS_getdents;
 }
+#endif
 
 template <>
 int GetdentsTest<struct linux_dirent64>::SyscallNum() {
   return SYS_getdents64;
 }
 
-// Test both legacy getdents and getdents64.
+#ifdef __x86_64__
+// Test both legacy getdents and getdents64 on x86_64.
 typedef ::testing::Types<struct linux_dirent, struct linux_dirent64>
     GetdentsTypes;
+#elif __aarch64__
+// Test only getdents64 on arm64.
+typedef ::testing::Types<struct linux_dirent64>
+    GetdentsTypes;
+#endif
 TYPED_TEST_SUITE(GetdentsTest, GetdentsTypes);
 
 // N.B. TYPED_TESTs require explicitly using this-> to access members of
diff --git a/test/syscalls/linux/preadv2.cc b/test/syscalls/linux/preadv2.cc
index c9246367d..3eeaf6ad8 100644
--- a/test/syscalls/linux/preadv2.cc
+++ b/test/syscalls/linux/preadv2.cc
@@ -35,6 +35,8 @@ namespace {
 #ifndef SYS_preadv2
 #if defined(__x86_64__)
 #define SYS_preadv2 327
+#elif defined(__aarch64__)
+#define SYS_preadv2 286
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 8cf08991b..5b4f29cd9 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1986,7 +1986,7 @@ TEST(Proc, GetdentsEnoent) {
       },
       nullptr, nullptr));
   char buf[1024];
-  ASSERT_THAT(syscall(SYS_getdents, fd.get(), buf, sizeof(buf)),
+  ASSERT_THAT(syscall(SYS_getdents64, fd.get(), buf, sizeof(buf)),
               SyscallFailsWithErrno(ENOENT));
 }
 
diff --git a/test/syscalls/linux/pwritev2.cc b/test/syscalls/linux/pwritev2.cc
index 1dbc0d6df..3fe5a600f 100644
--- a/test/syscalls/linux/pwritev2.cc
+++ b/test/syscalls/linux/pwritev2.cc
@@ -34,6 +34,8 @@ namespace {
 #ifndef SYS_pwritev2
 #if defined(__x86_64__)
 #define SYS_pwritev2 328
+#elif defined(__aarch64__)
+#define SYS_pwritev2 287
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 7e41fe7d8..6d7e543b9 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -49,7 +49,12 @@ namespace testing {
 namespace {
 
 // A syscall not implemented by Linux that we don't expect to be called.
+#ifdef __x86_64__
 constexpr uint32_t kFilteredSyscall = SYS_vserver;
+#elif __aarch64__
+// Using arch_specific_syscalls which are not implemented on arm64.
+constexpr uint32_t kFilteredSyscall = SYS_arch_specific_syscall+15;
+#endif
 
 // Applies a seccomp-bpf filter that returns `filtered_result` for
 // `sysno` and allows all other syscalls. Async-signal-safe.
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index 30de2f8ff..7a99f2636 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -557,6 +557,8 @@ TEST(SimpleStatTest, AnonDeviceAllocatesUniqueInodesAcrossSaveRestore) {
 #ifndef SYS_statx
 #if defined(__x86_64__)
 #define SYS_statx 332
+#elif defined(__aarch64__)
+#define SYS_statx 291
 #else
 #error "Unknown architecture"
 #endif
-- 
cgit v1.2.3


From 49e84b10e5ed7f94e6cbe003b9f7268e8235bb08 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Wed, 22 Jan 2020 06:22:18 +0000
Subject: Unify the kOLargeFile definition in syscall tests.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: Id9d6ae98305a4057d55d622ea4c3ac2228fea212
---
 test/syscalls/linux/fcntl.cc |  5 +----
 test/syscalls/linux/pipe.cc  |  6 +++---
 test/syscalls/linux/proc.cc  | 12 ------------
 test/util/fs_util.h          | 11 +++++++++++
 4 files changed, 15 insertions(+), 19 deletions(-)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 4f3aa81d6..421c15b87 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -31,6 +31,7 @@
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/cleanup.h"
 #include "test/util/eventfd_util.h"
+#include "test/util/fs_util.h"
 #include "test/util/multiprocess_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/save_util.h"
@@ -55,10 +56,6 @@ ABSL_FLAG(int32_t, socket_fd, -1,
 namespace gvisor {
 namespace testing {
 
-// O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
-// because "it isn't needed", even though Linux can return it via F_GETFL.
-constexpr int kOLargeFile = 00100000;
-
 class FcntlLockTest : public ::testing::Test {
  public:
   void SetUp() override {
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index ac9b21b24..d8e19e910 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -25,6 +25,7 @@
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
@@ -144,11 +145,10 @@ TEST_P(PipeTest, Flags) {
 
   if (IsNamedPipe()) {
     // May be stubbed to zero; define locally.
-    constexpr int kLargefile = 0100000;
     EXPECT_THAT(fcntl(rfd_.get(), F_GETFL),
-                SyscallSucceedsWithValue(kLargefile | O_RDONLY));
+                SyscallSucceedsWithValue(kOLargeFile | O_RDONLY));
     EXPECT_THAT(fcntl(wfd_.get(), F_GETFL),
-                SyscallSucceedsWithValue(kLargefile | O_WRONLY));
+                SyscallSucceedsWithValue(kOLargeFile | O_WRONLY));
   } else {
     EXPECT_THAT(fcntl(rfd_.get(), F_GETFL), SyscallSucceedsWithValue(O_RDONLY));
     EXPECT_THAT(fcntl(wfd_.get(), F_GETFL), SyscallSucceedsWithValue(O_WRONLY));
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index bf9bb45d3..a03c1e43d 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -100,18 +100,6 @@ namespace {
 #define SUID_DUMP_ROOT 2
 #endif /* SUID_DUMP_ROOT */
 
-// O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
-// because "it isn't needed", even though Linux can return it via F_GETFL.
-#if defined(__x86_64__) || defined(__i386__)
-constexpr int kOLargeFile = 00100000;
-#elif __aarch64__
-// The value originate from the Linux
-// kernel's arch/arm64/include/uapi/asm/fcntl.h.
-constexpr int kOLargeFile = 00400000;
-#else
-#error "Unknown architecture"
-#endif
-
 #if defined(__x86_64__) || defined(__i386__)
 // This list of "required" fields is taken from reading the file
 // arch/x86/kernel/cpu/proc.c and seeing which fields will be unconditionally
diff --git a/test/util/fs_util.h b/test/util/fs_util.h
index ee1b341d7..caf19b24d 100644
--- a/test/util/fs_util.h
+++ b/test/util/fs_util.h
@@ -26,6 +26,17 @@
 
 namespace gvisor {
 namespace testing {
+
+// O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
+// because "it isn't needed", even though Linux can return it via F_GETFL.
+#if defined(__x86_64__)
+constexpr int kOLargeFile = 00100000;
+#elif defined(__aarch64__)
+constexpr int kOLargeFile = 00400000;
+#else
+#error "Unknown architecture"
+#endif
+
 // Returns a status or the current working directory.
 PosixErrorOr<std::string> GetCWD();
 
-- 
cgit v1.2.3


From 51b783505b1ec164b02b48a0fd234509fba01a73 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 29 Jan 2020 15:41:51 -0800
Subject: Add support for TCP_DEFER_ACCEPT.

PiperOrigin-RevId: 292233574
---
 pkg/sentry/socket/netstack/netstack.go      |  22 ++++
 pkg/tcpip/tcpip.go                          |   6 ++
 pkg/tcpip/transport/tcp/BUILD               |   1 +
 pkg/tcpip/transport/tcp/accept.go           |  25 ++---
 pkg/tcpip/transport/tcp/connect.go          |  53 +++++++++-
 pkg/tcpip/transport/tcp/endpoint.go         |  26 ++++-
 pkg/tcpip/transport/tcp/forwarder.go        |   4 +-
 pkg/tcpip/transport/tcp/tcp_test.go         | 126 ++++++++++++++++++++++
 test/syscalls/linux/socket_inet_loopback.cc | 158 ++++++++++++++++++++++++++++
 test/syscalls/linux/tcp_socket.cc           |  53 ++++++++++
 10 files changed, 451 insertions(+), 23 deletions(-)

(limited to 'test/syscalls')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 8619cc506..049d04bf2 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1260,6 +1260,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 
 		return int32(time.Duration(v) / time.Second), nil
 
+	case linux.TCP_DEFER_ACCEPT:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.TCPDeferAcceptOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		return int32(time.Duration(v) / time.Second), nil
+
 	default:
 		emitUnimplementedEventTCP(t, name)
 	}
@@ -1713,6 +1725,16 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		v := usermem.ByteOrder.Uint32(optVal)
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))))
 
+	case linux.TCP_DEFER_ACCEPT:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+		v := int32(usermem.ByteOrder.Uint32(optVal))
+		if v < 0 {
+			v = 0
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))))
+
 	case linux.TCP_REPAIR_OPTIONS:
 		t.Kernel().EmitUnimplementedEvent(t)
 
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 59c9b3fb0..0fa141d58 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -626,6 +626,12 @@ type TCPLingerTimeoutOption time.Duration
 // before being marked closed.
 type TCPTimeWaitTimeoutOption time.Duration
 
+// TCPDeferAcceptOption is used by SetSockOpt/GetSockOpt to allow a
+// accept to return a completed connection only when there is data to be
+// read. This usually means the listening socket will drop the final ACK
+// for a handshake till the specified timeout until a segment with data arrives.
+type TCPDeferAcceptOption time.Duration
+
 // MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
 // TTL value for multicast messages. The default is 1.
 type MulticastTTLOption uint8
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 4acd9fb9a..7b4a87a2d 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -57,6 +57,7 @@ go_library(
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/log",
         "//pkg/rand",
         "//pkg/sleep",
         "//pkg/sync",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index d469758eb..6101f2945 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -222,13 +222,13 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu
 
 // createConnectingEndpoint creates a new endpoint in a connecting state, with
 // the connection parameters given by the arguments.
-func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions) (*endpoint, *tcpip.Error) {
+func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create a new endpoint.
 	netProto := l.netProto
 	if netProto == 0 {
 		netProto = s.route.NetProto
 	}
-	n := newEndpoint(l.stack, netProto, nil)
+	n := newEndpoint(l.stack, netProto, queue)
 	n.v6only = l.v6only
 	n.ID = s.id
 	n.boundNICID = s.route.NICID()
@@ -273,16 +273,17 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 
 // createEndpoint creates a new endpoint in connected state and then performs
 // the TCP 3-way handshake.
-func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions) (*endpoint, *tcpip.Error) {
+func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create new endpoint.
 	irs := s.sequenceNumber
 	isn := generateSecureISN(s.id, l.stack.Seed())
-	ep, err := l.createConnectingEndpoint(s, isn, irs, opts)
+	ep, err := l.createConnectingEndpoint(s, isn, irs, opts, queue)
 	if err != nil {
 		return nil, err
 	}
 
 	// listenEP is nil when listenContext is used by tcp.Forwarder.
+	deferAccept := time.Duration(0)
 	if l.listenEP != nil {
 		l.listenEP.mu.Lock()
 		if l.listenEP.EndpointState() != StateListen {
@@ -290,13 +291,12 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 			return nil, tcpip.ErrConnectionAborted
 		}
 		l.addPendingEndpoint(ep)
+		deferAccept = l.listenEP.deferAccept
 		l.listenEP.mu.Unlock()
 	}
 
 	// Perform the 3-way handshake.
-	h := newHandshake(ep, seqnum.Size(ep.initialReceiveWindow()))
-
-	h.resetToSynRcvd(isn, irs, opts)
+	h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept)
 	if err := h.execute(); err != nil {
 		ep.Close()
 		if l.listenEP != nil {
@@ -377,16 +377,14 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 	defer e.decSynRcvdCount()
 	defer s.decRef()
 
-	n, err := ctx.createEndpointAndPerformHandshake(s, opts)
+	n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{})
 	if err != nil {
 		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 		e.stats.FailedConnectionAttempts.Increment()
 		return
 	}
 	ctx.removePendingEndpoint(n)
-	// Start the protocol goroutine.
-	wq := &waiter.Queue{}
-	n.startAcceptedLoop(wq)
+	n.startAcceptedLoop()
 	e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 
 	e.deliverAccepted(n)
@@ -546,7 +544,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr
 		}
 
-		n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions)
+		n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions, &waiter.Queue{})
 		if err != nil {
 			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 			e.stats.FailedConnectionAttempts.Increment()
@@ -576,8 +574,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// space available in the backlog.
 
 		// Start the protocol goroutine.
-		wq := &waiter.Queue{}
-		n.startAcceptedLoop(wq)
+		n.startAcceptedLoop()
 		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 		go e.deliverAccepted(n)
 	}
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 4e3c5419c..9ff7ac261 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -86,6 +86,19 @@ type handshake struct {
 
 	// rcvWndScale is the receive window scale, as defined in RFC 1323.
 	rcvWndScale int
+
+	// startTime is the time at which the first SYN/SYN-ACK was sent.
+	startTime time.Time
+
+	// deferAccept if non-zero will drop the final ACK for a passive
+	// handshake till an ACK segment with data is received or the timeout is
+	// hit.
+	deferAccept time.Duration
+
+	// acked is true if the the final ACK for a 3-way handshake has
+	// been received. This is required to stop retransmitting the
+	// original SYN-ACK when deferAccept is enabled.
+	acked bool
 }
 
 func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
@@ -112,6 +125,12 @@ func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
 	return h
 }
 
+func newPassiveHandshake(ep *endpoint, rcvWnd seqnum.Size, isn, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) handshake {
+	h := newHandshake(ep, rcvWnd)
+	h.resetToSynRcvd(isn, irs, opts, deferAccept)
+	return h
+}
+
 // FindWndScale determines the window scale to use for the given maximum window
 // size.
 func FindWndScale(wnd seqnum.Size) int {
@@ -181,7 +200,7 @@ func (h *handshake) effectiveRcvWndScale() uint8 {
 
 // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
 // state.
-func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions) {
+func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) {
 	h.active = false
 	h.state = handshakeSynRcvd
 	h.flags = header.TCPFlagSyn | header.TCPFlagAck
@@ -189,6 +208,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
 	h.ackNum = irs + 1
 	h.mss = opts.MSS
 	h.sndWndScale = opts.WS
+	h.deferAccept = deferAccept
 	h.ep.mu.Lock()
 	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
@@ -352,6 +372,14 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 	// We have previously received (and acknowledged) the peer's SYN. If the
 	// peer acknowledges our SYN, the handshake is completed.
 	if s.flagIsSet(header.TCPFlagAck) {
+		// If deferAccept is not zero and this is a bare ACK and the
+		// timeout is not hit then drop the ACK.
+		if h.deferAccept != 0 && s.data.Size() == 0 && time.Since(h.startTime) < h.deferAccept {
+			h.acked = true
+			h.ep.stack.Stats().DroppedPackets.Increment()
+			return nil
+		}
+
 		// If the timestamp option is negotiated and the segment does
 		// not carry a timestamp option then the segment must be dropped
 		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
@@ -365,10 +393,16 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
 		}
 		h.state = handshakeCompleted
+
 		h.ep.mu.Lock()
 		h.ep.transitionToStateEstablishedLocked(h)
+		// If the segment has data then requeue it for the receiver
+		// to process it again once main loop is started.
+		if s.data.Size() > 0 {
+			s.incRef()
+			h.ep.enqueueSegment(s)
+		}
 		h.ep.mu.Unlock()
-
 		return nil
 	}
 
@@ -471,6 +505,7 @@ func (h *handshake) execute() *tcpip.Error {
 		}
 	}
 
+	h.startTime = time.Now()
 	// Initialize the resend timer.
 	resendWaker := sleep.Waker{}
 	timeOut := time.Duration(time.Second)
@@ -524,11 +559,21 @@ func (h *handshake) execute() *tcpip.Error {
 		switch index, _ := s.Fetch(true); index {
 		case wakerForResend:
 			timeOut *= 2
-			if timeOut > 60*time.Second {
+			if timeOut > MaxRTO {
 				return tcpip.ErrTimeout
 			}
 			rt.Reset(timeOut)
-			h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+			// Resend the SYN/SYN-ACK only if the following conditions hold.
+			//  - It's an active handshake (deferAccept does not apply)
+			//  - It's a passive handshake and we have not yet got the final-ACK.
+			//  - It's a passive handshake and we got an ACK but deferAccept is
+			//    enabled and we are now past the deferAccept duration.
+			// The last is required to provide a way for the peer to complete
+			// the connection with another ACK or data (as ACKs are never
+			// retransmitted on their own).
+			if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept {
+				h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+			}
 
 		case wakerForNotification:
 			n := h.ep.fetchNotifications()
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 13718ff55..8d52414b7 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -498,6 +498,13 @@ type endpoint struct {
 	// without any data being acked.
 	userTimeout time.Duration
 
+	// deferAccept if non-zero specifies a user specified time during
+	// which the final ACK of a handshake will be dropped provided the
+	// ACK is a bare ACK and carries no data. If the timeout is crossed then
+	// the bare ACK is accepted and the connection is delivered to the
+	// listener.
+	deferAccept time.Duration
+
 	// pendingAccepted is a synchronization primitive used to track number
 	// of connections that are queued up to be delivered to the accepted
 	// channel. We use this to ensure that all goroutines blocked on writing
@@ -1574,6 +1581,15 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case tcpip.TCPDeferAcceptOption:
+		e.mu.Lock()
+		if time.Duration(v) > MaxRTO {
+			v = tcpip.TCPDeferAcceptOption(MaxRTO)
+		}
+		e.deferAccept = time.Duration(v)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return nil
 	}
@@ -1798,6 +1814,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case *tcpip.TCPDeferAcceptOption:
+		e.mu.Lock()
+		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -2149,9 +2171,8 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 
 // startAcceptedLoop sets up required state and starts a goroutine with the
 // main loop for accepted connections.
-func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
+func (e *endpoint) startAcceptedLoop() {
 	e.mu.Lock()
-	e.waiterQueue = waiterQueue
 	e.workerRunning = true
 	e.mu.Unlock()
 	wakerInitDone := make(chan struct{})
@@ -2177,7 +2198,6 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	default:
 		return nil, nil, tcpip.ErrWouldBlock
 	}
-
 	return n, n.waiterQueue, nil
 }
 
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index 7eb613be5..c9ee5bf06 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -157,13 +157,13 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint,
 		TSVal:         r.synOptions.TSVal,
 		TSEcr:         r.synOptions.TSEcr,
 		SACKPermitted: r.synOptions.SACKPermitted,
-	})
+	}, queue)
 	if err != nil {
 		return nil, err
 	}
 
 	// Start the protocol goroutine.
-	ep.startAcceptedLoop(queue)
+	ep.startAcceptedLoop()
 
 	return ep, nil
 }
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index df2fb1071..a12336d47 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -6787,3 +6787,129 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) {
 		),
 	)
 }
+
+func TestTCPDeferAccept(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	const tcpDeferAccept = 1 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %v", tcpDeferAccept, err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Send data. This should result in an acceptable endpoint.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+
+	// Give a bit of time for the socket to be delivered to the accept queue.
+	time.Sleep(50 * time.Millisecond)
+	aep, _, err := c.EP.Accept()
+	if err != nil {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err)
+	}
+
+	aep.Close()
+	// Closing aep without reading the data should trigger a RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
+
+func TestTCPDeferAcceptTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	const tcpDeferAccept = 1 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %v", tcpDeferAccept, err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Sleep for a little of the tcpDeferAccept timeout.
+	time.Sleep(tcpDeferAccept + 100*time.Millisecond)
+
+	// On timeout expiry we should get a SYN-ACK retransmission.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
+		checker.AckNum(uint32(irs)+1)))
+
+	// Send data. This should result in an acceptable endpoint.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+
+	// Give sometime for the endpoint to be delivered to the accept queue.
+	time.Sleep(50 * time.Millisecond)
+	aep, _, err := c.EP.Accept()
+	if err != nil {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err)
+	}
+
+	aep.Close()
+	// Closing aep without reading the data should trigger a RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 2f9821555..3bf7081b9 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -828,6 +828,164 @@ TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
   EXPECT_EQ(get, kUserTimeout);
 }
 
+// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+// saved. Enable S/R once issue is fixed.
+TEST_P(SocketInetLoopbackTest, TCPDeferAccept_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+  // saved. Enable S/R issue is fixed.
+  DisableSave ds;
+
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Set the TCP_DEFER_ACCEPT on the listening socket.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Set the listening socket to nonblock so that we can verify that there is no
+  // connection in queue despite the connect above succeeding since the peer has
+  // sent no data and TCP_DEFER_ACCEPT is set on the listening socket. Set the
+  // FD to O_NONBLOCK.
+  int opts;
+  ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds());
+  opts |= O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Set FD back to blocking.
+  opts &= ~O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Now write some data to the socket.
+  int data = 0;
+  ASSERT_THAT(RetryEINTR(write)(conn_fd.get(), &data, sizeof(data)),
+              SyscallSucceedsWithValue(sizeof(data)));
+
+  // This should now cause the connection to complete and be delivered to the
+  // accept socket.
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+  // Verify that the accepted socket returns the data written.
+  int get = -1;
+  ASSERT_THAT(RetryEINTR(recv)(accepted.get(), &get, sizeof(get), 0),
+              SyscallSucceedsWithValue(sizeof(get)));
+
+  EXPECT_EQ(get, data);
+}
+
+// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+// saved. Enable S/R once issue is fixed.
+TEST_P(SocketInetLoopbackTest, TCPDeferAcceptTimeout_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+  // saved. Enable S/R once issue is fixed.
+  DisableSave ds;
+
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Set the TCP_DEFER_ACCEPT on the listening socket.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Set the listening socket to nonblock so that we can verify that there is no
+  // connection in queue despite the connect above succeeding since the peer has
+  // sent no data and TCP_DEFER_ACCEPT is set on the listening socket. Set the
+  // FD to O_NONBLOCK.
+  int opts;
+  ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds());
+  opts |= O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Verify that there is no acceptable connection before TCP_DEFER_ACCEPT
+  // timeout is hit.
+  absl::SleepFor(absl::Seconds(kTCPDeferAccept - 1));
+  ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Set FD back to blocking.
+  opts &= ~O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Now sleep for a little over the TCP_DEFER_ACCEPT duration. When the timeout
+  // is hit a SYN-ACK should be retransmitted by the listener as a last ditch
+  // attempt to complete the connection with or without data.
+  absl::SleepFor(absl::Seconds(2));
+
+  // Verify that we have a connection that can be accepted even though no
+  // data was written.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     All, SocketInetLoopbackTest,
     ::testing::Values(
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 33a5ac66c..525ccbd88 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1286,6 +1286,59 @@ TEST_P(SimpleTcpSocketTest, SetTCPUserTimeout) {
   EXPECT_EQ(get, kTCPUserTimeout);
 }
 
+TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptNeg) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  // -ve TCP_DEFER_ACCEPT is same as setting it to zero.
+  constexpr int kNeg = -1;
+  EXPECT_THAT(
+      setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &kNeg, sizeof(kNeg)),
+      SyscallSucceeds());
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);
+}
+
+TEST_P(SimpleTcpSocketTest, GetTCPDeferAcceptDefault) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptGreaterThanZero) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  // kTCPDeferAccept is in seconds.
+  // NOTE: linux translates seconds to # of retries and back from
+  //   #of retries to seconds. Which means only certain values
+  //   translate back exactly. That's why we use 3 here, a value of
+  //   5 will result in us getting back 7 instead of 5 in the
+  //   getsockopt.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
+      SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kTCPDeferAccept);
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From ede8dfab3760afc8063c3418f217e52f7ec70d42 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Thu, 30 Jan 2020 09:13:36 -0800
Subject: Enforce splice offset limits

Splice must not allow negative offsets. Writes also must not allow offset +
size to overflow int64. Reads are similarly broken, but not just in splice
(b/148095030).

Reported-by: syzbot+0e1ff0b95fb2859b4190@syzkaller.appspotmail.com
PiperOrigin-RevId: 292361208
---
 pkg/sentry/fs/tmpfs/inode_file.go       | 10 ++++--
 pkg/sentry/syscalls/linux/sys_splice.go | 16 ++++------
 test/syscalls/linux/splice.cc           | 56 +++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+), 12 deletions(-)

(limited to 'test/syscalls')

diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index dabc10662..25abbc151 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -17,6 +17,7 @@ package tmpfs
 import (
 	"fmt"
 	"io"
+	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -444,10 +445,15 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error)
 	defer rw.f.dataMu.Unlock()
 
 	// Compute the range to write.
-	end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes()))
-	if end == rw.offset { // srcs.NumBytes() == 0?
+	if srcs.NumBytes() == 0 {
+		// Nothing to do.
 		return 0, nil
 	}
+	end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes()))
+	if end == math.MaxInt64 {
+		// Overflow.
+		return 0, syserror.EINVAL
+	}
 
 	// Check if seals prevent either file growth or all writes.
 	switch {
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index f43d6c155..fd642834b 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -25,6 +25,10 @@ import (
 
 // doSplice implements a blocking splice operation.
 func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) {
+	if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 {
+		return 0, syserror.EINVAL
+	}
+
 	var (
 		total int64
 		n     int64
@@ -82,11 +86,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	offsetAddr := args[2].Pointer()
 	count := int64(args[3].SizeT())
 
-	// Don't send a negative number of bytes.
-	if count < 0 {
-		return 0, nil, syserror.EINVAL
-	}
-
 	// Get files.
 	inFile := t.GetFile(inFD)
 	if inFile == nil {
@@ -136,11 +135,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 			return 0, nil, err
 		}
 
-		// The offset must be valid.
-		if offset < 0 {
-			return 0, nil, syserror.EINVAL
-		}
-
 		// Do the splice.
 		n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{
 			Length:    count,
@@ -227,6 +221,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			if _, err := t.CopyIn(outOffset, &offset); err != nil {
 				return 0, nil, err
 			}
+
 			// Use the destination offset.
 			opts.DstOffset = true
 			opts.DstStart = offset
@@ -244,6 +239,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			if _, err := t.CopyIn(inOffset, &offset); err != nil {
 				return 0, nil, err
 			}
+
 			// Use the source offset.
 			opts.SrcOffset = true
 			opts.SrcStart = offset
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index 85232cb1f..faa1247f6 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -60,6 +60,62 @@ TEST(SpliceTest, TwoRegularFiles) {
               SyscallFailsWithErrno(EINVAL));
 }
 
+int memfd_create(const std::string& name, unsigned int flags) {
+  return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+TEST(SpliceTest, NegativeOffset) {
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Fill the pipe.
+  std::vector<char> buf(kPageSize);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(kPageSize));
+
+  // Open the output file as write only.
+  int fd;
+  EXPECT_THAT(fd = memfd_create("negative", 0), SyscallSucceeds());
+  const FileDescriptor out_fd(fd);
+
+  loff_t out_offset = 0xffffffffffffffffull;
+  constexpr int kSize = 2;
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kSize, 0),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+// Write offset + size overflows int64.
+//
+// This is a regression test for b/148041624.
+TEST(SpliceTest, WriteOverflow) {
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Fill the pipe.
+  std::vector<char> buf(kPageSize);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(kPageSize));
+
+  // Open the output file.
+  int fd;
+  EXPECT_THAT(fd = memfd_create("overflow", 0), SyscallSucceeds());
+  const FileDescriptor out_fd(fd);
+
+  // out_offset + kSize overflows INT64_MAX.
+  loff_t out_offset = 0x7ffffffffffffffeull;
+  constexpr int kSize = 3;
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kSize, 0),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(SpliceTest, SamePipe) {
   // Create a new pipe.
   int fds[2];
-- 
cgit v1.2.3


From 4ee64a248ec16fcc9e526a457a66648546611bfb Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 30 Jan 2020 11:48:36 -0800
Subject: Fix for panic in endpoint.Close().

When sending a RST on shutdown we need to double check the
state after acquiring the work mutex as the endpoint could
have transitioned out of a connected state from the time
we checked it and we acquired the workMutex.

I added two tests but sadly neither reproduce the panic. I am
going to leave the tests in as they are good to have anyway.

PiperOrigin-RevId: 292393800
---
 pkg/tcpip/transport/tcp/BUILD                |  1 +
 pkg/tcpip/transport/tcp/endpoint.go          | 10 ++++-
 pkg/tcpip/transport/tcp/tcp_test.go          | 55 ++++++++++++++++++++++++++++
 test/syscalls/linux/BUILD                    |  1 +
 test/syscalls/linux/socket_ip_tcp_generic.cc | 33 +++++++++++++++++
 5 files changed, 98 insertions(+), 2 deletions(-)

(limited to 'test/syscalls')

diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 7b4a87a2d..272e8f570 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -91,6 +91,7 @@ go_test(
     tags = ["flaky"],
     deps = [
         ":tcp",
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/checker",
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 8d52414b7..b5a8e15ee 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2047,8 +2047,14 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 				// work mutex is available.
 				if e.workMu.TryLock() {
 					e.mu.Lock()
-					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
-					e.notifyProtocolGoroutine(notifyTickleWorker)
+					// We need to double check here to make
+					// sure worker has not transitioned the
+					// endpoint out of a connected state
+					// before trying to send a reset.
+					if e.EndpointState().connected() {
+						e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+						e.notifyProtocolGoroutine(notifyTickleWorker)
+					}
 					e.mu.Unlock()
 					e.workMu.Unlock()
 				} else {
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index a12336d47..2c1505067 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -21,6 +21,7 @@ import (
 	"testing"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/checker"
@@ -6913,3 +6914,57 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
 		checker.SeqNum(uint32(iss+1)),
 		checker.AckNum(uint32(irs+5))))
 }
+
+func TestResetDuringClose(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	iss := seqnum.Value(789)
+	c.CreateConnected(iss, 30000, -1 /* epRecvBuf */)
+	// Send some data to make sure there is some unread
+	// data to trigger a reset on c.Close.
+	irs := c.IRS
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss.Add(1),
+		AckNum:  irs.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(irs.Add(1))),
+		checker.AckNum(uint32(iss.Add(5)))))
+
+	// Close in a separate goroutine so that we can trigger
+	// a race with the RST we send below. This should not
+	// panic due to the route being released depeding on
+	// whether Close() sends an active RST or the RST sent
+	// below is processed by the worker first.
+	var wg sync.WaitGroup
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		c.SendPacket(nil, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			SeqNum:  iss.Add(5),
+			AckNum:  c.IRS.Add(5),
+			RcvWnd:  30000,
+			Flags:   header.TCPFlagRst,
+		})
+	}()
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		c.EP.Close()
+	}()
+
+	wg.Wait()
+}
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 74bf068ec..7958fd0d7 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2173,6 +2173,7 @@ cc_library(
         ":socket_test_util",
         "//test/util:test_util",
         "//test/util:thread_util",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
     ],
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 57ce8e169..27779e47c 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -24,6 +24,7 @@
 #include <sys/un.h>
 
 #include "gtest/gtest.h"
+#include "absl/memory/memory.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/syscalls/linux/socket_test_util.h"
@@ -875,5 +876,37 @@ TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) {
   EXPECT_EQ(get, kAbove);
 }
 
+TEST_P(TCPSocketPairTest, TCPResetDuringClose_NoRandomSave) {
+  DisableSave ds;  // Too many syscalls.
+  constexpr int kThreadCount = 1000;
+  std::unique_ptr<ScopedThread> instances[kThreadCount];
+  for (int i = 0; i < kThreadCount; i++) {
+    instances[i] = absl::make_unique<ScopedThread>([&]() {
+      auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+      ScopedThread t([&]() {
+        // Close one end to trigger sending of a FIN.
+        struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0};
+        // Wait up to 20 seconds for the data.
+        constexpr int kPollTimeoutMs = 20000;
+        ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+                    SyscallSucceedsWithValue(1));
+        ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+      });
+
+      // Send some data then close.
+      constexpr char kStr[] = "abc";
+      ASSERT_THAT(write(sockets->first_fd(), kStr, 3),
+                  SyscallSucceedsWithValue(3));
+      absl::SleepFor(absl::Milliseconds(10));
+      ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+      t.Join();
+    });
+  }
+  for (int i = 0; i < kThreadCount; i++) {
+    instances[i]->Join();
+  }
+}
+
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 9988cf2eeff596ce519046d80c54d09166f7d84b Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Thu, 30 Jan 2020 14:06:54 -0800
Subject: Wrap all GetSocketPairs() in unnamed namespaces

This avoids conflicting definitions of GetSocketPairs() in outer namespace when
multiple such cc files are complied for one binary.

PiperOrigin-RevId: 292420885
---
 test/syscalls/linux/socket_abstract.cc                       | 2 ++
 test/syscalls/linux/socket_filesystem.cc                     | 2 ++
 test/syscalls/linux/socket_ip_tcp_generic_loopback.cc        | 2 ++
 test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc       | 2 ++
 test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc       | 2 ++
 test/syscalls/linux/socket_ip_udp_loopback.cc                | 2 ++
 test/syscalls/linux/socket_ip_udp_loopback_blocking.cc       | 2 ++
 test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc       | 2 ++
 test/syscalls/linux/socket_unix_abstract_nonblock.cc         | 2 ++
 test/syscalls/linux/socket_unix_blocking_local.cc            | 2 ++
 test/syscalls/linux/socket_unix_dgram_local.cc               | 2 ++
 test/syscalls/linux/socket_unix_domain.cc                    | 2 ++
 test/syscalls/linux/socket_unix_filesystem_nonblock.cc       | 2 ++
 test/syscalls/linux/socket_unix_non_stream_blocking_local.cc | 2 ++
 test/syscalls/linux/socket_unix_pair.cc                      | 2 ++
 test/syscalls/linux/socket_unix_pair_nonblock.cc             | 2 ++
 test/syscalls/linux/socket_unix_seqpacket_local.cc           | 2 ++
 test/syscalls/linux/socket_unix_stream_blocking_local.cc     | 2 ++
 test/syscalls/linux/socket_unix_stream_local.cc              | 2 ++
 test/syscalls/linux/socket_unix_stream_nonblock_local.cc     | 2 ++
 20 files changed, 40 insertions(+)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/socket_abstract.cc b/test/syscalls/linux/socket_abstract.cc
index 715d87b76..00999f192 100644
--- a/test/syscalls/linux/socket_abstract.cc
+++ b/test/syscalls/linux/socket_abstract.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -43,5 +44,6 @@ INSTANTIATE_TEST_SUITE_P(
     AbstractUnixSockets, UnixSocketPairCmsgTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_filesystem.cc b/test/syscalls/linux/socket_filesystem.cc
index 74e262959..287359363 100644
--- a/test/syscalls/linux/socket_filesystem.cc
+++ b/test/syscalls/linux/socket_filesystem.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -43,5 +44,6 @@ INSTANTIATE_TEST_SUITE_P(
     FilesystemUnixSockets, UnixSocketPairCmsgTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
index d11f7cc23..4e79d21f4 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVecToVec<SocketPairKind>(
@@ -39,5 +40,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllTCPSockets, TCPSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
index fcd20102f..f996b93d2 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVecToVec<SocketPairKind>(
@@ -39,5 +40,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingTCPSockets, BlockingStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
index 63a05b799..ffa377210 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVecToVec<SocketPairKind>(
@@ -38,5 +39,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingTCPSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_loopback.cc b/test/syscalls/linux/socket_ip_udp_loopback.cc
index 1df74a348..c7fa44884 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -44,5 +45,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllUDPSockets, UDPSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
index 1e259efa7..d6925a8df 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingUDPSockets, BlockingNonStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
index 74cbd326d..d675eddc6 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingUDPSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_abstract_nonblock.cc b/test/syscalls/linux/socket_unix_abstract_nonblock.cc
index be31ab2a7..8bef76b67 100644
--- a/test/syscalls/linux/socket_unix_abstract_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_abstract_nonblock.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingAbstractUnixSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_blocking_local.cc b/test/syscalls/linux/socket_unix_blocking_local.cc
index 6f84221b2..77cb8c6d6 100644
--- a/test/syscalls/linux/socket_unix_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_blocking_local.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(
@@ -39,5 +40,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingUnixDomainSockets, BlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_dgram_local.cc b/test/syscalls/linux/socket_unix_dgram_local.cc
index 9134fcdf7..31d2d5216 100644
--- a/test/syscalls/linux/socket_unix_dgram_local.cc
+++ b/test/syscalls/linux/socket_unix_dgram_local.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(VecCat<SocketPairKind>(
@@ -52,5 +53,6 @@ INSTANTIATE_TEST_SUITE_P(
     DgramUnixSockets, NonStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_domain.cc b/test/syscalls/linux/socket_unix_domain.cc
index fa3efc7f8..f7dff8b4d 100644
--- a/test/syscalls/linux/socket_unix_domain.cc
+++ b/test/syscalls/linux/socket_unix_domain.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllUnixDomainSockets, AllSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
index 8ba7af971..6700b4d90 100644
--- a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingFilesystemUnixSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
index 8855d5001..fddcdf1c5 100644
--- a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(
@@ -36,5 +37,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingNonStreamUnixSockets, BlockingNonStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_pair.cc b/test/syscalls/linux/socket_unix_pair.cc
index 411fb4518..85999db04 100644
--- a/test/syscalls/linux/socket_unix_pair.cc
+++ b/test/syscalls/linux/socket_unix_pair.cc
@@ -22,6 +22,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(ApplyVec<SocketPairKind>(
@@ -38,5 +39,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllUnixDomainSockets, UnixSocketPairCmsgTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_pair_nonblock.cc b/test/syscalls/linux/socket_unix_pair_nonblock.cc
index 3135d325f..281410a9a 100644
--- a/test/syscalls/linux/socket_unix_pair_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_pair_nonblock.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingUnixSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_seqpacket_local.cc b/test/syscalls/linux/socket_unix_seqpacket_local.cc
index dff75a532..69a5f150d 100644
--- a/test/syscalls/linux/socket_unix_seqpacket_local.cc
+++ b/test/syscalls/linux/socket_unix_seqpacket_local.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(VecCat<SocketPairKind>(
@@ -52,5 +53,6 @@ INSTANTIATE_TEST_SUITE_P(
     SeqpacketUnixSockets, UnixNonStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
index 08e579ba7..8429bd429 100644
--- a/test/syscalls/linux/socket_unix_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -34,5 +35,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingStreamUnixSockets, BlockingStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_stream_local.cc b/test/syscalls/linux/socket_unix_stream_local.cc
index 65eef1a81..a7e3449a9 100644
--- a/test/syscalls/linux/socket_unix_stream_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_local.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(
@@ -42,5 +43,6 @@ INSTANTIATE_TEST_SUITE_P(
     StreamUnixSockets, StreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
index 1936aa135..4b763c8e2 100644
--- a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
@@ -20,6 +20,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingStreamUnixSockets, NonBlockingStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 14959250feb71df74dea13f3cb15dcbe8ce6b3f3 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 30 Jan 2020 17:37:17 -0800
Subject: Simplify testing link rules.

PiperOrigin-RevId: 292458933
---
 test/syscalls/linux/BUILD | 688 +++++++++++++++++++++++-----------------------
 test/util/BUILD           |  30 +-
 tools/build/defs.bzl      |   1 +
 tools/defs.bzl            |   3 +-
 tools/images/BUILD        |   4 +-
 5 files changed, 363 insertions(+), 363 deletions(-)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index ee7a8a673..e4ca5b6db 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_binary", "cc_library", "default_net_util", "select_arch", "select_system")
+load("//tools:defs.bzl", "cc_binary", "cc_library", "default_net_util", "gtest", "select_arch", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -82,14 +82,14 @@ cc_library(
     srcs = ["base_poll_test.cc"],
     hdrs = ["base_poll_test.h"],
     deps = [
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -99,11 +99,11 @@ cc_library(
     hdrs = ["file_base.h"],
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -130,7 +130,7 @@ cc_library(
     hdrs = ["socket_test_util.h"],
     defines = select_system(),
     deps = default_net_util() + [
-        "@com_google_googletest//:gtest",
+        gtest,
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -155,9 +155,9 @@ cc_library(
     hdrs = ["unix_domain_socket_test_util.h"],
     deps = [
         ":socket_test_util",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
     ],
 )
 
@@ -179,14 +179,14 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -199,13 +199,13 @@ cc_binary(
     ),
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:memory_util",
         "//test/util:platform_util",
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -218,9 +218,9 @@ cc_binary(
         ":socket_test_util",
         ":unix_domain_socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -233,9 +233,9 @@ cc_binary(
         ":socket_test_util",
         ":unix_domain_socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -247,10 +247,10 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -262,12 +262,12 @@ cc_binary(
     deps = [
         "//test/util:cleanup",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -280,12 +280,11 @@ cc_binary(
     ],
     linkstatic = 1,
     deps = [
-        # The heapchecker doesn't recognize that io_destroy munmaps.
-        "@com_google_googletest//:gtest",
-        "@com_google_absl//absl/strings",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:memory_util",
         "//test/util:posix_error",
         "//test/util:proc_util",
@@ -302,12 +301,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -320,9 +319,9 @@ cc_binary(
         "//:sandbox",
     ],
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -334,9 +333,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -348,9 +347,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -372,10 +371,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -388,10 +387,10 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -404,14 +403,14 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/synchronization",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -424,12 +423,12 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/flags:flag",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -443,12 +442,12 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:mount_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -458,9 +457,9 @@ cc_binary(
     srcs = ["clock_getres.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -470,11 +469,11 @@ cc_binary(
     srcs = ["clock_gettime.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -484,13 +483,13 @@ cc_binary(
     srcs = ["concurrency.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:platform_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -503,9 +502,9 @@ cc_binary(
         ":socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -516,10 +515,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -530,9 +529,9 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -544,11 +543,11 @@ cc_binary(
     deps = [
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -561,10 +560,10 @@ cc_binary(
         "//test/util:epoll_util",
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -576,10 +575,10 @@ cc_binary(
     deps = [
         "//test/util:epoll_util",
         "//test/util:eventfd_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -589,12 +588,12 @@ cc_binary(
     srcs = ["exceptions.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:logging",
         "//test/util:platform_util",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -604,10 +603,10 @@ cc_binary(
     srcs = ["getcpu.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -617,10 +616,10 @@ cc_binary(
     srcs = ["getcpu.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -630,13 +629,13 @@ cc_binary(
     srcs = ["getrusage.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -652,14 +651,14 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:proc_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -682,15 +681,15 @@ cc_binary(
     deps = [
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/types:optional",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/types:optional",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -701,11 +700,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -718,10 +717,10 @@ cc_binary(
         ":file_base",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -731,9 +730,9 @@ cc_binary(
     srcs = ["fault.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -744,10 +743,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -761,18 +760,18 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:eventfd_util",
         "//test/util:fs_util",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:save_util",
         "//test/util:temp_path",
         "//test/util:test_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -786,15 +785,15 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -805,13 +804,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -824,11 +823,11 @@ cc_binary(
     ),
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:logging",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -841,10 +840,10 @@ cc_binary(
     ),
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -855,10 +854,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -869,10 +868,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -884,6 +883,9 @@ cc_binary(
     deps = [
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:memory_util",
         "//test/util:save_util",
         "//test/util:temp_path",
@@ -892,9 +894,6 @@ cc_binary(
         "//test/util:thread_util",
         "//test/util:time_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -907,12 +906,12 @@ cc_binary(
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -922,9 +921,9 @@ cc_binary(
     srcs = ["getrandom.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -957,10 +956,10 @@ cc_binary(
         ":socket_test_util",
         ":unix_domain_socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -984,9 +983,9 @@ cc_binary(
         ":socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -997,6 +996,9 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
@@ -1004,9 +1006,6 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1018,15 +1017,15 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1039,14 +1038,14 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1057,10 +1056,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1071,6 +1070,7 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
@@ -1078,7 +1078,6 @@ cc_binary(
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1089,12 +1088,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        "@com_google_absl//absl/memory",
+        gtest,
         "//test/util:memory_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1104,11 +1103,11 @@ cc_binary(
     srcs = ["mincore.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:memory_util",
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1121,10 +1120,10 @@ cc_binary(
         ":temp_umask",
         "//test/util:capability_util",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1135,11 +1134,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1151,12 +1150,12 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:cleanup",
+        gtest,
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
         "//test/util:rlimit_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1169,13 +1168,13 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1188,6 +1187,9 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:mount_util",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
@@ -1195,9 +1197,6 @@ cc_binary(
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1207,10 +1206,9 @@ cc_binary(
     srcs = ["mremap.cc"],
     linkstatic = 1,
     deps = [
-        # The heap check fails due to MremapDeathTest
-        "@com_google_googletest//:gtest",
-        "@com_google_absl//absl/strings",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
@@ -1242,9 +1240,9 @@ cc_binary(
     srcs = ["munmap.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1261,14 +1259,14 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1282,10 +1280,10 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1299,11 +1297,11 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:endian",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -1317,11 +1315,11 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:endian",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -1333,16 +1331,16 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:pty_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1354,12 +1352,12 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:posix_error",
         "//test/util:pty_util",
         "//test/util:test_main",
         "//test/util:thread_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1372,12 +1370,12 @@ cc_binary(
         "//test/syscalls/linux:socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1387,13 +1385,13 @@ cc_binary(
     srcs = ["pause.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1405,15 +1403,15 @@ cc_binary(
     deps = [
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1426,13 +1424,13 @@ cc_binary(
         ":base_poll_test",
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1443,11 +1441,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         ":base_poll_test",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1458,9 +1456,9 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1472,12 +1470,12 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:cleanup",
+        "@com_google_absl//absl/flags:flag",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1488,13 +1486,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        "@com_google_absl//absl/flags:flag",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1505,10 +1503,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1519,6 +1517,8 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:temp_path",
@@ -1526,8 +1526,6 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1541,13 +1539,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1559,11 +1557,11 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1577,6 +1575,10 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:memory_util",
         "//test/util:posix_error",
         "//test/util:temp_path",
@@ -1584,10 +1586,6 @@ cc_binary(
         "//test/util:thread_util",
         "//test/util:time_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1601,11 +1599,11 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -1617,17 +1615,17 @@ cc_binary(
     deps = [
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:optional",
+        gtest,
         "//test/util:memory_util",
         "//test/util:posix_error",
         "//test/util:proc_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/types:optional",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1641,6 +1639,8 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
@@ -1648,8 +1648,6 @@ cc_binary(
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1660,11 +1658,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         ":base_poll_test",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1674,6 +1672,9 @@ cc_binary(
     srcs = ["ptrace.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:platform_util",
@@ -1681,9 +1682,6 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1693,10 +1691,10 @@ cc_binary(
     srcs = ["pwrite64.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1710,12 +1708,12 @@ cc_binary(
     deps = [
         ":file_base",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1729,11 +1727,11 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:endian",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -1747,10 +1745,10 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1764,10 +1762,10 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1778,10 +1776,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1792,10 +1790,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1811,13 +1809,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1832,12 +1830,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1851,11 +1849,11 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1879,11 +1877,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/syscalls/linux/rseq:lib",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1894,11 +1892,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        gtest,
         "//test/util:logging",
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1908,9 +1906,9 @@ cc_binary(
     srcs = ["sched.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1920,9 +1918,9 @@ cc_binary(
     srcs = ["sched_yield.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1932,6 +1930,8 @@ cc_binary(
     srcs = ["seccomp.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
@@ -1939,8 +1939,6 @@ cc_binary(
         "//test/util:proc_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1952,14 +1950,14 @@ cc_binary(
     deps = [
         ":base_poll_test",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:rlimit_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1971,13 +1969,13 @@ cc_binary(
     deps = [
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1989,12 +1987,12 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2005,13 +2003,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2021,9 +2019,9 @@ cc_binary(
     srcs = ["sigaction.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2038,13 +2036,13 @@ cc_binary(
     deps = [
         "//test/util:cleanup",
         "//test/util:fs_util",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2057,7 +2055,7 @@ cc_binary(
     ),
     linkstatic = 1,
     deps = [
-        "@com_google_googletest//:gtest",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_util",
@@ -2075,14 +2073,14 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/synchronization",
+        gtest,
         "//test/util:logging",
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2092,10 +2090,10 @@ cc_binary(
     srcs = ["sigprocmask.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2105,13 +2103,13 @@ cc_binary(
     srcs = ["sigstop.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2122,13 +2120,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2144,10 +2142,10 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
     ],
     alwayslink = 1,
 )
@@ -2160,8 +2158,8 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2174,8 +2172,8 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2191,11 +2189,11 @@ cc_library(
     ],
     deps = [
         ":socket_test_util",
-        "//test/util:test_util",
-        "//test/util:thread_util",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
+        "//test/util:thread_util",
     ],
     alwayslink = 1,
 )
@@ -2212,8 +2210,8 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2230,9 +2228,9 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:memory_util",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2250,8 +2248,8 @@ cc_library(
         ":ip_socket_test_util",
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2268,8 +2266,8 @@ cc_library(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2286,9 +2284,9 @@ cc_library(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
-        "//test/util:test_util",
         "@com_google_absl//absl/memory",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
     ],
     alwayslink = 1,
 )
@@ -2305,8 +2303,8 @@ cc_library(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2323,8 +2321,8 @@ cc_library(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2387,9 +2385,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2419,9 +2417,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2451,9 +2449,9 @@ cc_binary(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2551,10 +2549,10 @@ cc_binary(
         ":socket_bind_to_device_util",
         ":socket_test_util",
         "//test/util:capability_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2570,10 +2568,10 @@ cc_binary(
         ":socket_bind_to_device_util",
         ":socket_test_util",
         "//test/util:capability_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2589,10 +2587,10 @@ cc_binary(
         ":socket_bind_to_device_util",
         ":socket_test_util",
         "//test/util:capability_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2638,9 +2636,9 @@ cc_binary(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2719,15 +2717,15 @@ cc_binary(
         ":ip_socket_test_util",
         ":socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:save_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2739,9 +2737,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2755,10 +2753,10 @@ cc_binary(
         ":socket_test_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings:str_format",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2771,9 +2769,9 @@ cc_binary(
         ":socket_netlink_util",
         ":socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2791,9 +2789,9 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
-        "//test/util:test_util",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
     ],
     alwayslink = 1,
 )
@@ -2810,11 +2808,11 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2831,10 +2829,10 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2851,10 +2849,10 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2871,11 +2869,11 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2892,8 +2890,8 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2910,10 +2908,10 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -3007,9 +3005,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3021,9 +3019,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3035,9 +3033,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3052,9 +3050,9 @@ cc_binary(
         ":socket_blocking_test_cases",
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3069,9 +3067,9 @@ cc_binary(
         ":ip_socket_test_util",
         ":socket_blocking_test_cases",
         ":socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3086,9 +3084,9 @@ cc_binary(
         ":socket_non_stream_blocking_test_cases",
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3103,9 +3101,9 @@ cc_binary(
         ":ip_socket_test_util",
         ":socket_non_stream_blocking_test_cases",
         ":socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3121,9 +3119,9 @@ cc_binary(
         ":socket_unix_cmsg_test_cases",
         ":socket_unix_test_cases",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3135,9 +3133,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3149,9 +3147,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3164,10 +3162,10 @@ cc_binary(
         ":socket_netlink_util",
         ":socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:endian",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/base:endian",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3183,12 +3181,12 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3199,11 +3197,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3217,12 +3215,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3235,10 +3233,10 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3248,10 +3246,10 @@ cc_binary(
     srcs = ["sync.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3261,10 +3259,10 @@ cc_binary(
     srcs = ["sysinfo.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3274,9 +3272,9 @@ cc_binary(
     srcs = ["syslog.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3286,10 +3284,10 @@ cc_binary(
     srcs = ["sysret.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:logging",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3301,12 +3299,12 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3316,11 +3314,11 @@ cc_binary(
     srcs = ["tgkill.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3330,10 +3328,10 @@ cc_binary(
     srcs = ["time.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:proc_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3358,15 +3356,15 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3376,11 +3374,11 @@ cc_binary(
     srcs = ["tkill.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:logging",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3394,11 +3392,11 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3414,12 +3412,12 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -3442,9 +3440,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3455,14 +3453,14 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:uid_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3473,11 +3471,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3490,11 +3488,11 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3504,11 +3502,11 @@ cc_binary(
     srcs = ["unshare.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/synchronization",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3534,11 +3532,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:fs_util",
+        gtest,
         "//test/util:posix_error",
         "//test/util:proc_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3548,13 +3546,13 @@ cc_binary(
     srcs = ["vfork.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:test_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3566,6 +3564,10 @@ cc_binary(
     deps = [
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
@@ -3574,10 +3576,6 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3588,10 +3586,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3602,12 +3600,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3618,14 +3616,14 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
-        "//test/util:test_main",
-        "//test/util:test_util",
-        "//test/util:thread_util",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
     ],
 )
 
@@ -3651,10 +3649,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3664,11 +3662,11 @@ cc_binary(
     srcs = ["vdso_clock_gettime.cc"],
     linkstatic = 1,
     deps = [
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -3678,10 +3676,10 @@ cc_binary(
     srcs = ["vsyscall.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:proc_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3694,11 +3692,11 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -3710,12 +3708,12 @@ cc_binary(
     deps = [
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3727,10 +3725,10 @@ cc_binary(
     deps = [
         ":ip_socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3742,10 +3740,10 @@ cc_binary(
     deps = [
         ":ip_socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3761,11 +3759,11 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
diff --git a/test/util/BUILD b/test/util/BUILD
index 1ac8b3fd6..1f22ebe29 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_library", "cc_test", "select_system")
+load("//tools:defs.bzl", "cc_library", "cc_test", "gtest", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -41,7 +41,7 @@ cc_library(
         ":save_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -55,7 +55,7 @@ cc_library(
         ":posix_error",
         ":test_util",
         "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -67,7 +67,7 @@ cc_test(
         ":proc_util",
         ":test_main",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -87,7 +87,7 @@ cc_library(
         ":file_descriptor",
         ":posix_error",
         "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -101,7 +101,7 @@ cc_test(
         ":temp_path",
         ":test_main",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -134,7 +134,7 @@ cc_library(
         ":cleanup",
         ":posix_error",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -183,7 +183,7 @@ cc_library(
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:variant",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -194,7 +194,7 @@ cc_test(
     deps = [
         ":posix_error",
         ":test_main",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -218,7 +218,7 @@ cc_library(
         ":cleanup",
         ":posix_error",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -233,7 +233,7 @@ cc_library(
         ":test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -259,7 +259,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -291,7 +291,7 @@ cc_library(
         ":posix_error",
         ":test_util",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -302,7 +302,7 @@ cc_test(
     deps = [
         ":test_main",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -322,7 +322,7 @@ cc_library(
         ":file_descriptor",
         ":posix_error",
         ":save_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
diff --git a/tools/build/defs.bzl b/tools/build/defs.bzl
index d0556abd1..967c1f900 100644
--- a/tools/build/defs.bzl
+++ b/tools/build/defs.bzl
@@ -18,6 +18,7 @@ cc_test = _cc_test
 cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
 go_image = _go_image
 go_embed_data = _go_embed_data
+gtest = "@com_google_googletest//:gtest"
 loopback = "//tools/build:loopback"
 proto_library = native.proto_library
 pkg_deb = _pkg_deb
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 819f12b0d..ce677cbbf 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,7 +7,7 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/build:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/build:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
 
 # Delegate directly.
 cc_binary = _cc_binary
@@ -20,6 +20,7 @@ go_embed_data = _go_embed_data
 go_image = _go_image
 go_test = _go_test
 go_tool_library = _go_tool_library
+gtest = _gtest
 pkg_deb = _pkg_deb
 pkg_tar = _pkg_tar
 py_library = _py_library
diff --git a/tools/images/BUILD b/tools/images/BUILD
index f1699b184..fe11f08a3 100644
--- a/tools/images/BUILD
+++ b/tools/images/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_binary")
+load("//tools:defs.bzl", "cc_binary", "gtest")
 load("//tools/images:defs.bzl", "vm_image", "vm_test")
 
 package(
@@ -32,8 +32,8 @@ cc_binary(
     srcs = ["test.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
-        "@com_google_googletest//:gtest",
     ],
 )
 
-- 
cgit v1.2.3


From 04cccaaeeed22a28a42fc4c1406b43a966a5d886 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 31 Jan 2020 14:44:50 -0800
Subject: Fix logic around AMD/Intel cases.

If the support is Ignored, then the call is still executed. We
simply rely on it to fall through to the int3. Therefore, we
must also bail on the vendor check.

PiperOrigin-RevId: 292620558
---
 test/syscalls/linux/32bit.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index 2751fb4e7..9883aef61 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -102,7 +102,8 @@ TEST(Syscall32Bit, Int80) {
 }
 
 TEST(Syscall32Bit, Sysenter) {
-  if (PlatformSupport32Bit() == PlatformSupport::Allowed &&
+  if ((PlatformSupport32Bit() == PlatformSupport::Allowed ||
+       PlatformSupport32Bit() == PlatformSupport::Ignored) &&
       GetCPUVendor() == CPUVendor::kAMD) {
     // SYSENTER is an illegal instruction in compatibility mode on AMD.
     EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
@@ -133,7 +134,8 @@ TEST(Syscall32Bit, Sysenter) {
 }
 
 TEST(Syscall32Bit, Syscall) {
-  if (PlatformSupport32Bit() == PlatformSupport::Allowed &&
+  if ((PlatformSupport32Bit() == PlatformSupport::Allowed ||
+       PlatformSupport32Bit() == PlatformSupport::Ignored) &&
       GetCPUVendor() == CPUVendor::kIntel) {
     // SYSCALL is an illegal instruction in compatibility mode on Intel.
     EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
-- 
cgit v1.2.3


From e7846e50f2df070a15dd33235b334e2223f715f3 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Mon, 3 Feb 2020 15:30:28 -0800
Subject: Reduce run time for
 //test/syscalls:socket_inet_loopback_test_runsc_ptrace.

* Tests are picked for a shard differently. It now picks one test from each
  block, instead of picking the whole block. This makes the same kind of tests
  spreads across different shards.

* Reduce the number of connect() calls in TCPListenClose.

PiperOrigin-RevId: 293019281
---
 runsc/testutil/testutil.go                  | 61 ++++++++++++++---------------
 test/runtimes/runner.go                     |  9 ++---
 test/syscalls/linux/socket_inet_loopback.cc | 13 +++---
 test/syscalls/syscall_test_runner.go        |  7 ++--
 4 files changed, 43 insertions(+), 47 deletions(-)

(limited to 'test/syscalls')

diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index fb22eae39..5d0b0ae54 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -434,43 +434,40 @@ func IsStatic(filename string) (bool, error) {
 	return true, nil
 }
 
-// TestBoundsForShard calculates the beginning and end indices for the test
-// based on the TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars. The
-// returned ints are the beginning (inclusive) and end (exclusive) of the
-// subslice corresponding to the shard. If either of the env vars are not
-// present, then the function will return bounds that include all tests. If
-// there are more shards than there are tests, then the returned list may be
-// empty.
-func TestBoundsForShard(numTests int) (int, int, error) {
+// TestIndicesForShard returns indices for this test shard based on the
+// TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars.
+//
+// If either of the env vars are not present, then the function will return all
+// tests. If there are more shards than there are tests, then the returned list
+// may be empty.
+func TestIndicesForShard(numTests int) ([]int, error) {
 	var (
-		begin = 0
-		end   = numTests
+		shardIndex = 0
+		shardTotal = 1
 	)
-	indexStr, totalStr := os.Getenv("TEST_SHARD_INDEX"), os.Getenv("TEST_TOTAL_SHARDS")
-	if indexStr == "" || totalStr == "" {
-		return begin, end, nil
-	}
 
-	// Parse index and total to ints.
-	shardIndex, err := strconv.Atoi(indexStr)
-	if err != nil {
-		return 0, 0, fmt.Errorf("invalid TEST_SHARD_INDEX %q: %v", indexStr, err)
-	}
-	shardTotal, err := strconv.Atoi(totalStr)
-	if err != nil {
-		return 0, 0, fmt.Errorf("invalid TEST_TOTAL_SHARDS %q: %v", totalStr, err)
+	indexStr, totalStr := os.Getenv("TEST_SHARD_INDEX"), os.Getenv("TEST_TOTAL_SHARDS")
+	if indexStr != "" && totalStr != "" {
+		// Parse index and total to ints.
+		var err error
+		shardIndex, err = strconv.Atoi(indexStr)
+		if err != nil {
+			return nil, fmt.Errorf("invalid TEST_SHARD_INDEX %q: %v", indexStr, err)
+		}
+		shardTotal, err = strconv.Atoi(totalStr)
+		if err != nil {
+			return nil, fmt.Errorf("invalid TEST_TOTAL_SHARDS %q: %v", totalStr, err)
+		}
 	}
 
 	// Calculate!
-	shardSize := int(math.Ceil(float64(numTests) / float64(shardTotal)))
-	begin = shardIndex * shardSize
-	end = ((shardIndex + 1) * shardSize)
-	if begin > numTests {
-		// Nothing to run.
-		return 0, 0, nil
-	}
-	if end > numTests {
-		end = numTests
+	var indices []int
+	numBlocks := int(math.Ceil(float64(numTests) / float64(shardTotal)))
+	for i := 0; i < numBlocks; i++ {
+		pick := i*shardTotal + shardIndex
+		if pick < numTests {
+			indices = append(indices, pick)
+		}
 	}
-	return begin, end, nil
+	return indices, nil
 }
diff --git a/test/runtimes/runner.go b/test/runtimes/runner.go
index bec37c69d..ddb890dbc 100644
--- a/test/runtimes/runner.go
+++ b/test/runtimes/runner.go
@@ -20,7 +20,6 @@ import (
 	"flag"
 	"fmt"
 	"io"
-	"log"
 	"os"
 	"sort"
 	"strings"
@@ -101,17 +100,15 @@ func getTests(d dockerutil.Docker, blacklist map[string]struct{}) ([]testing.Int
 	// shard.
 	tests := strings.Fields(list)
 	sort.Strings(tests)
-	begin, end, err := testutil.TestBoundsForShard(len(tests))
+	indices, err := testutil.TestIndicesForShard(len(tests))
 	if err != nil {
 		return nil, fmt.Errorf("TestsForShard() failed: %v", err)
 	}
-	log.Printf("Got bounds [%d:%d) for shard out of %d total tests", begin, end, len(tests))
-	tests = tests[begin:end]
 
 	var itests []testing.InternalTest
-	for _, tc := range tests {
+	for _, tci := range indices {
 		// Capture tc in this scope.
-		tc := tc
+		tc := tests[tci]
 		itests = append(itests, testing.InternalTest{
 			Name: tc,
 			F: func(t *testing.T) {
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 3bf7081b9..b24618a88 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -325,6 +325,12 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   TestAddress const& listener = param.listener;
   TestAddress const& connector = param.connector;
 
+  constexpr int kAcceptCount = 32;
+  constexpr int kBacklog = kAcceptCount * 2;
+  constexpr int kFDs = 128;
+  constexpr int kThreadCount = 4;
+  constexpr int kFDsPerThread = kFDs / kThreadCount;
+
   // Create the listening socket.
   FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
       Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
@@ -332,7 +338,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
                    listener.addr_len),
               SyscallSucceeds());
-  ASSERT_THAT(listen(listen_fd.get(), 1001), SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
 
   // Get the port bound by the listening socket.
   socklen_t addrlen = listener.addr_len;
@@ -345,9 +351,6 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   DisableSave ds;  // Too many system calls.
   sockaddr_storage conn_addr = connector.addr;
   ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
-  constexpr int kFDs = 2048;
-  constexpr int kThreadCount = 4;
-  constexpr int kFDsPerThread = kFDs / kThreadCount;
   FileDescriptor clients[kFDs];
   std::unique_ptr<ScopedThread> threads[kThreadCount];
   for (int i = 0; i < kFDs; i++) {
@@ -371,7 +374,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   for (int i = 0; i < kThreadCount; i++) {
     threads[i]->Join();
   }
-  for (int i = 0; i < 32; i++) {
+  for (int i = 0; i < kAcceptCount; i++) {
     auto accepted =
         ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
   }
diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
index b9fd885ff..ae342b68c 100644
--- a/test/syscalls/syscall_test_runner.go
+++ b/test/syscalls/syscall_test_runner.go
@@ -450,17 +450,16 @@ func main() {
 	}
 
 	// Get subset of tests corresponding to shard.
-	begin, end, err := testutil.TestBoundsForShard(len(testCases))
+	indices, err := testutil.TestIndicesForShard(len(testCases))
 	if err != nil {
 		fatalf("TestsForShard() failed: %v", err)
 	}
-	testCases = testCases[begin:end]
 
 	// Run the tests.
 	var tests []testing.InternalTest
-	for _, tc := range testCases {
+	for _, tci := range indices {
 		// Capture tc.
-		tc := tc
+		tc := testCases[tci]
 		testName := fmt.Sprintf("%s_%s", tc.Suite, tc.Name)
 		tests = append(tests, testing.InternalTest{
 			Name: testName,
-- 
cgit v1.2.3


From d7cd484091543827678f1548b8e5668a7a86e13f Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 4 Feb 2020 08:20:10 -0800
Subject: Add support for sentry internal pipe for gofer mounts

Internal pipes are supported similarly to how internal UDS is done.
It is also controlled by the same flag.

Fixes #1102

PiperOrigin-RevId: 293150045
---
 pkg/sentry/fs/gofer/BUILD            |   2 +
 pkg/sentry/fs/gofer/cache_policy.go  |   3 +
 pkg/sentry/fs/gofer/fifo.go          |  40 ++++++++
 pkg/sentry/fs/gofer/gofer_test.go    |   2 +-
 pkg/sentry/fs/gofer/path.go          | 183 +++++++++++++++++++++++------------
 pkg/sentry/fs/gofer/session.go       | 183 ++++++++++++++++++++++-------------
 pkg/sentry/fs/gofer/session_state.go |  12 +--
 pkg/sentry/fs/gofer/socket.go        |   8 +-
 test/syscalls/BUILD                  |   1 -
 9 files changed, 293 insertions(+), 141 deletions(-)
 create mode 100644 pkg/sentry/fs/gofer/fifo.go

(limited to 'test/syscalls')

diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD
index 971d3718e..fea135eea 100644
--- a/pkg/sentry/fs/gofer/BUILD
+++ b/pkg/sentry/fs/gofer/BUILD
@@ -9,6 +9,7 @@ go_library(
         "cache_policy.go",
         "context_file.go",
         "device.go",
+        "fifo.go",
         "file.go",
         "file_state.go",
         "fs.go",
@@ -38,6 +39,7 @@ go_library(
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/host",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
         "//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go
index ebea03c42..07a564e92 100644
--- a/pkg/sentry/fs/gofer/cache_policy.go
+++ b/pkg/sentry/fs/gofer/cache_policy.go
@@ -127,6 +127,9 @@ func (cp cachePolicy) revalidate(ctx context.Context, name string, parent, child
 
 	childIops, ok := child.InodeOperations.(*inodeOperations)
 	if !ok {
+		if _, ok := child.InodeOperations.(*fifo); ok {
+			return false
+		}
 		panic(fmt.Sprintf("revalidating inode operations of unknown type %T", child.InodeOperations))
 	}
 	parentIops, ok := parent.InodeOperations.(*inodeOperations)
diff --git a/pkg/sentry/fs/gofer/fifo.go b/pkg/sentry/fs/gofer/fifo.go
new file mode 100644
index 000000000..456557058
--- /dev/null
+++ b/pkg/sentry/fs/gofer/fifo.go
@@ -0,0 +1,40 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+)
+
+// +stateify savable
+type fifo struct {
+	fs.InodeOperations
+	fileIops *inodeOperations
+}
+
+var _ fs.InodeOperations = (*fifo)(nil)
+
+// Rename implements fs.InodeOperations. It forwards the call to the underlying
+// file inode to handle the file rename. Note that file key remains the same
+// after the rename to keep the endpoint mapping.
+func (i *fifo) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error {
+	return i.fileIops.Rename(ctx, inode, oldParent, oldName, newParent, newName, replacement)
+}
+
+// StatFS implements fs.InodeOperations.
+func (i *fifo) StatFS(ctx context.Context) (fs.Info, error) {
+	return i.fileIops.StatFS(ctx)
+}
diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go
index 0c2f89ae8..2df2fe889 100644
--- a/pkg/sentry/fs/gofer/gofer_test.go
+++ b/pkg/sentry/fs/gofer/gofer_test.go
@@ -61,7 +61,7 @@ func rootTest(t *testing.T, name string, cp cachePolicy, fn func(context.Context
 		ctx := contexttest.Context(t)
 		sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{
 			file: rootFile,
-		}, root.QID, p9.AttrMaskAll(), root.Attr, false /* socket */)
+		}, root.QID, p9.AttrMaskAll(), root.Attr)
 		m := fs.NewMountSource(ctx, s, &filesystem{}, fs.MountSourceFlags{})
 		rootInode := fs.NewInode(ctx, rootInodeOperations, m, sattr)
 
diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go
index 0c1be05ef..a35c3a23d 100644
--- a/pkg/sentry/fs/gofer/path.go
+++ b/pkg/sentry/fs/gofer/path.go
@@ -23,14 +23,24 @@ import (
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/device"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // maxFilenameLen is the maximum length of a filename. This is dictated by 9P's
 // encoding of strings, which uses 2 bytes for the length prefix.
 const maxFilenameLen = (1 << 16) - 1
 
+func changeType(mode p9.FileMode, newType p9.FileMode) p9.FileMode {
+	if newType&^p9.FileModeMask != 0 {
+		panic(fmt.Sprintf("newType contained more bits than just file mode: %x", newType))
+	}
+	clear := mode &^ p9.FileModeMask
+	return clear | newType
+}
+
 // Lookup loads an Inode at name into a Dirent based on the session's cache
 // policy.
 func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) {
@@ -69,8 +79,25 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string
 		return nil, err
 	}
 
+	if i.session().overrides != nil {
+		// Check if file belongs to a internal named pipe. Note that it doesn't need
+		// to check for sockets because it's done in newInodeOperations below.
+		deviceKey := device.MultiDeviceKey{
+			Device:          p9attr.RDev,
+			SecondaryDevice: i.session().connID,
+			Inode:           qids[0].Path,
+		}
+		unlock := i.session().overrides.lock()
+		if pipeInode := i.session().overrides.getPipe(deviceKey); pipeInode != nil {
+			unlock()
+			pipeInode.IncRef()
+			return fs.NewDirent(ctx, pipeInode, name), nil
+		}
+		unlock()
+	}
+
 	// Construct the Inode operations.
-	sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr, false)
+	sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr)
 
 	// Construct a positive Dirent.
 	return fs.NewDirent(ctx, fs.NewInode(ctx, node, dir.MountSource, sattr), name), nil
@@ -138,7 +165,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string
 	qid := qids[0]
 
 	// Construct the InodeOperations.
-	sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr, false)
+	sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr)
 
 	// Construct the positive Dirent.
 	d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name)
@@ -223,82 +250,115 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
 		return nil, syserror.ENAMETOOLONG
 	}
 
-	if i.session().endpoints == nil {
+	if i.session().overrides == nil {
 		return nil, syscall.EOPNOTSUPP
 	}
 
-	// Create replaces the directory fid with the newly created/opened
-	// file, so clone this directory so it doesn't change out from under
-	// this node.
-	_, newFile, err := i.fileState.file.walk(ctx, nil)
+	// Stabilize the override map while creation is in progress.
+	unlock := i.session().overrides.lock()
+	defer unlock()
+
+	sattr, iops, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeSocket)
 	if err != nil {
 		return nil, err
 	}
-	// We're not going to use newFile after return.
-	defer newFile.close(ctx)
 
-	// Stabilize the endpoint map while creation is in progress.
-	unlock := i.session().endpoints.lock()
-	defer unlock()
+	// Construct the positive Dirent.
+	childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name)
+	i.session().overrides.addBoundEndpoint(iops.fileState.key, childDir, ep)
+	return childDir, nil
+}
 
-	// Create a regular file in the gofer and then mark it as a socket by
-	// adding this inode key in the 'endpoints' map.
-	owner := fs.FileOwnerFromContext(ctx)
-	hostFile, err := newFile.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID))
-	if err != nil {
-		return nil, err
+// CreateFifo implements fs.InodeOperations.CreateFifo.
+func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
+	if len(name) > maxFilenameLen {
+		return syserror.ENAMETOOLONG
 	}
-	// We're not going to use this file.
-	hostFile.Close()
 
-	i.touchModificationAndStatusChangeTime(ctx, dir)
+	owner := fs.FileOwnerFromContext(ctx)
+	mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe
 
-	// Get the attributes of the file to create inode key.
-	qid, mask, attr, err := getattr(ctx, newFile)
-	if err != nil {
-		return nil, err
+	// N.B. FIFOs use major/minor numbers 0.
+	if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil {
+		if i.session().overrides == nil || err != syscall.EPERM {
+			return err
+		}
+		// If gofer doesn't support mknod, check if we can create an internal fifo.
+		return i.createInternalFifo(ctx, dir, name, owner, perm)
 	}
 
-	key := device.MultiDeviceKey{
-		Device:          attr.RDev,
-		SecondaryDevice: i.session().connID,
-		Inode:           qid.Path,
+	i.touchModificationAndStatusChangeTime(ctx, dir)
+	return nil
+}
+
+func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode, name string, owner fs.FileOwner, perm fs.FilePermissions) error {
+	if i.session().overrides == nil {
+		return syserror.EPERM
 	}
 
-	// Create child dirent.
+	// Stabilize the override map while creation is in progress.
+	unlock := i.session().overrides.lock()
+	defer unlock()
 
-	// Get an unopened p9.File for the file we created so that it can be
-	// cloned and re-opened multiple times after creation.
-	_, unopened, err := i.fileState.file.walk(ctx, []string{name})
+	sattr, fileOps, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeNamedPipe)
 	if err != nil {
-		return nil, err
+		return err
 	}
 
-	// Construct the InodeOperations.
-	sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, attr, true)
+	// First create a pipe.
+	p := pipe.NewPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)
+
+	// Wrap the fileOps with our Fifo.
+	iops := &fifo{
+		InodeOperations: pipe.NewInodeOperations(ctx, perm, p),
+		fileIops:        fileOps,
+	}
+	inode := fs.NewInode(ctx, iops, dir.MountSource, sattr)
 
 	// Construct the positive Dirent.
 	childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name)
-	i.session().endpoints.add(key, childDir, ep)
-	return childDir, nil
+	i.session().overrides.addPipe(fileOps.fileState.key, childDir, inode)
+	return nil
 }
 
-// CreateFifo implements fs.InodeOperations.CreateFifo.
-func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
-	if len(name) > maxFilenameLen {
-		return syserror.ENAMETOOLONG
+// Caller must hold Session.endpoint lock.
+func (i *inodeOperations) createEndpointFile(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions, fileType p9.FileMode) (fs.StableAttr, *inodeOperations, error) {
+	_, dirClone, err := i.fileState.file.walk(ctx, nil)
+	if err != nil {
+		return fs.StableAttr{}, nil, err
 	}
+	// We're not going to use dirClone after return.
+	defer dirClone.close(ctx)
 
+	// Create a regular file in the gofer and then mark it as a socket by
+	// adding this inode key in the 'overrides' map.
 	owner := fs.FileOwnerFromContext(ctx)
-	mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe
-
-	// N.B. FIFOs use major/minor numbers 0.
-	if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil {
-		return err
+	hostFile, err := dirClone.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID))
+	if err != nil {
+		return fs.StableAttr{}, nil, err
 	}
+	// We're not going to use this file.
+	hostFile.Close()
 
 	i.touchModificationAndStatusChangeTime(ctx, dir)
-	return nil
+
+	// Get the attributes of the file to create inode key.
+	qid, mask, attr, err := getattr(ctx, dirClone)
+	if err != nil {
+		return fs.StableAttr{}, nil, err
+	}
+
+	// Get an unopened p9.File for the file we created so that it can be
+	// cloned and re-opened multiple times after creation.
+	_, unopened, err := i.fileState.file.walk(ctx, []string{name})
+	if err != nil {
+		return fs.StableAttr{}, nil, err
+	}
+
+	// Construct new inode with file type overridden.
+	attr.Mode = changeType(attr.Mode, fileType)
+	sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, attr)
+	return sattr, iops, nil
 }
 
 // Remove implements InodeOperations.Remove.
@@ -307,20 +367,23 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string
 		return syserror.ENAMETOOLONG
 	}
 
-	var key device.MultiDeviceKey
-	removeSocket := false
-	if i.session().endpoints != nil {
-		// Find out if file being deleted is a socket that needs to be
+	var key *device.MultiDeviceKey
+	if i.session().overrides != nil {
+		// Find out if file being deleted is a socket or pipe that needs to be
 		// removed from endpoint map.
 		if d, err := i.Lookup(ctx, dir, name); err == nil {
 			defer d.DecRef()
-			if fs.IsSocket(d.Inode.StableAttr) {
-				child := d.Inode.InodeOperations.(*inodeOperations)
-				key = child.fileState.key
-				removeSocket = true
 
-				// Stabilize the endpoint map while deletion is in progress.
-				unlock := i.session().endpoints.lock()
+			if fs.IsSocket(d.Inode.StableAttr) || fs.IsPipe(d.Inode.StableAttr) {
+				switch iops := d.Inode.InodeOperations.(type) {
+				case *inodeOperations:
+					key = &iops.fileState.key
+				case *fifo:
+					key = &iops.fileIops.fileState.key
+				}
+
+				// Stabilize the override map while deletion is in progress.
+				unlock := i.session().overrides.lock()
 				defer unlock()
 			}
 		}
@@ -329,8 +392,8 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string
 	if err := i.fileState.file.unlinkAt(ctx, name, 0); err != nil {
 		return err
 	}
-	if removeSocket {
-		i.session().endpoints.remove(key)
+	if key != nil {
+		i.session().overrides.remove(*key)
 	}
 	i.touchModificationAndStatusChangeTime(ctx, dir)
 
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index 498c4645a..f6b3ef178 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -33,60 +33,107 @@ import (
 var DefaultDirentCacheSize uint64 = fs.DefaultDirentCacheSize
 
 // +stateify savable
-type endpointMaps struct {
-	// mu protexts the direntMap, the keyMap, and the pathMap below.
-	mu sync.RWMutex `state:"nosave"`
+type overrideInfo struct {
+	dirent *fs.Dirent
+
+	// endpoint is set when dirent points to a socket. inode must not be set.
+	endpoint transport.BoundEndpoint
+
+	// inode is set when dirent points to a pipe. endpoint must not be set.
+	inode *fs.Inode
+}
 
-	// direntMap links sockets to their dirents.
-	// It is filled concurrently with the keyMap and is stored upon save.
-	// Before saving, this map is used to populate the pathMap.
-	direntMap map[transport.BoundEndpoint]*fs.Dirent
+func (l *overrideInfo) inodeType() fs.InodeType {
+	switch {
+	case l.endpoint != nil:
+		return fs.Socket
+	case l.inode != nil:
+		return fs.Pipe
+	}
+	panic("endpoint or node must be set")
+}
 
-	// keyMap links MultiDeviceKeys (containing inode IDs) to their sockets.
+// +stateify savable
+type overrideMaps struct {
+	// mu protexts the keyMap, and the pathMap below.
+	mu sync.RWMutex `state:"nosave"`
+
+	// keyMap links MultiDeviceKeys (containing inode IDs) to their sockets/pipes.
 	// It is not stored during save because the inode ID may change upon restore.
-	keyMap map[device.MultiDeviceKey]transport.BoundEndpoint `state:"nosave"`
+	keyMap map[device.MultiDeviceKey]*overrideInfo `state:"nosave"`
 
-	// pathMap links the sockets to their paths.
+	// pathMap links the sockets/pipes to their paths.
 	// It is filled before saving from the direntMap and is stored upon save.
 	// Upon restore, this map is used to re-populate the keyMap.
-	pathMap map[transport.BoundEndpoint]string
+	pathMap map[*overrideInfo]string
+}
+
+// addBoundEndpoint adds the bound endpoint to the map.
+// A reference is taken on the dirent argument.
+//
+// Precondition: maps must have been locked with 'lock'.
+func (e *overrideMaps) addBoundEndpoint(key device.MultiDeviceKey, d *fs.Dirent, ep transport.BoundEndpoint) {
+	d.IncRef()
+	e.keyMap[key] = &overrideInfo{dirent: d, endpoint: ep}
 }
 
-// add adds the endpoint to the maps.
+// addPipe adds the pipe inode to the map.
 // A reference is taken on the dirent argument.
 //
 // Precondition: maps must have been locked with 'lock'.
-func (e *endpointMaps) add(key device.MultiDeviceKey, d *fs.Dirent, ep transport.BoundEndpoint) {
-	e.keyMap[key] = ep
+func (e *overrideMaps) addPipe(key device.MultiDeviceKey, d *fs.Dirent, inode *fs.Inode) {
 	d.IncRef()
-	e.direntMap[ep] = d
+	e.keyMap[key] = &overrideInfo{dirent: d, inode: inode}
 }
 
 // remove deletes the key from the maps.
 //
 // Precondition: maps must have been locked with 'lock'.
-func (e *endpointMaps) remove(key device.MultiDeviceKey) {
-	endpoint := e.get(key)
+func (e *overrideMaps) remove(key device.MultiDeviceKey) {
+	endpoint := e.keyMap[key]
 	delete(e.keyMap, key)
-
-	d := e.direntMap[endpoint]
-	d.DecRef()
-	delete(e.direntMap, endpoint)
+	endpoint.dirent.DecRef()
 }
 
 // lock blocks other addition and removal operations from happening while
 // the backing file is being created or deleted. Returns a function that unlocks
 // the endpoint map.
-func (e *endpointMaps) lock() func() {
+func (e *overrideMaps) lock() func() {
 	e.mu.Lock()
 	return func() { e.mu.Unlock() }
 }
 
-// get returns the endpoint mapped to the given key.
+// getBoundEndpoint returns the bound endpoint mapped to the given key.
 //
-// Precondition: maps must have been locked for reading.
-func (e *endpointMaps) get(key device.MultiDeviceKey) transport.BoundEndpoint {
-	return e.keyMap[key]
+// Precondition: maps must have been locked.
+func (e *overrideMaps) getBoundEndpoint(key device.MultiDeviceKey) transport.BoundEndpoint {
+	if v := e.keyMap[key]; v != nil {
+		return v.endpoint
+	}
+	return nil
+}
+
+// getPipe returns the pipe inode mapped to the given key.
+//
+// Precondition: maps must have been locked.
+func (e *overrideMaps) getPipe(key device.MultiDeviceKey) *fs.Inode {
+	if v := e.keyMap[key]; v != nil {
+		return v.inode
+	}
+	return nil
+}
+
+// getType returns the inode type if there is a corresponding endpoint for the
+// given key. Returns false otherwise.
+func (e *overrideMaps) getType(key device.MultiDeviceKey) (fs.InodeType, bool) {
+	e.mu.Lock()
+	v := e.keyMap[key]
+	e.mu.Unlock()
+
+	if v != nil {
+		return v.inodeType(), true
+	}
+	return 0, false
 }
 
 // session holds state for each 9p session established during sys_mount.
@@ -137,16 +184,16 @@ type session struct {
 	// mounter is the EUID/EGID that mounted this file system.
 	mounter fs.FileOwner `state:"wait"`
 
-	// endpoints is used to map inodes that represent socket files to their
-	// corresponding endpoint. Socket files are created as regular files in the
-	// gofer and their presence in this map indicate that they should indeed be
-	// socket files. This allows unix domain sockets to be used with paths that
-	// belong to a gofer.
+	// overrides is used to map inodes that represent socket/pipes files to their
+	// corresponding endpoint/iops. These files are created as regular files in
+	// the gofer and their presence in this map indicate that they should indeed
+	// be socket/pipe files. This allows unix domain sockets and named pipes to
+	// be used with paths that belong to a gofer.
 	//
 	// TODO(gvisor.dev/issue/1200): there are few possible races with someone
 	// stat'ing the file and another deleting it concurrently, where the file
 	// will not be reported as socket file.
-	endpoints *endpointMaps `state:"wait"`
+	overrides *overrideMaps `state:"wait"`
 }
 
 // Destroy tears down the session.
@@ -179,15 +226,21 @@ func (s *session) SaveInodeMapping(inode *fs.Inode, path string) {
 	// This is very unintuitive. We *CANNOT* trust the inode's StableAttrs,
 	// because overlay copyUp may have changed them out from under us.
 	// So much for "immutable".
-	sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr
-	s.inodeMappings[sattr.InodeID] = path
+	switch iops := inode.InodeOperations.(type) {
+	case *inodeOperations:
+		s.inodeMappings[iops.fileState.sattr.InodeID] = path
+	case *fifo:
+		s.inodeMappings[iops.fileIops.fileState.sattr.InodeID] = path
+	default:
+		panic(fmt.Sprintf("Invalid type: %T", iops))
+	}
 }
 
-// newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File and attributes
-// (p9.QID, p9.AttrMask, p9.Attr).
+// newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File
+// and attributes (p9.QID, p9.AttrMask, p9.Attr).
 //
 // Endpoints lock must not be held if socket == false.
-func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr, socket bool) (fs.StableAttr, *inodeOperations) {
+func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr) (fs.StableAttr, *inodeOperations) {
 	deviceKey := device.MultiDeviceKey{
 		Device:          attr.RDev,
 		SecondaryDevice: s.connID,
@@ -201,17 +254,11 @@ func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p
 		BlockSize: bsize(attr),
 	}
 
-	if s.endpoints != nil {
-		if socket {
-			sattr.Type = fs.Socket
-		} else {
-			// If unix sockets are allowed on this filesystem, check if this file is
-			// supposed to be a socket file.
-			unlock := s.endpoints.lock()
-			if s.endpoints.get(deviceKey) != nil {
-				sattr.Type = fs.Socket
-			}
-			unlock()
+	if s.overrides != nil && sattr.Type == fs.RegularFile {
+		// If overrides are allowed on this filesystem, check if this file is
+		// supposed to be of a different type, e.g. socket.
+		if t, ok := s.overrides.getType(deviceKey); ok {
+			sattr.Type = t
 		}
 	}
 
@@ -267,7 +314,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
 	s.EnableLeakCheck("gofer.session")
 
 	if o.privateunixsocket {
-		s.endpoints = newEndpointMaps()
+		s.overrides = newOverrideMaps()
 	}
 
 	// Construct the MountSource with the session and superBlockFlags.
@@ -305,26 +352,24 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
 		return nil, err
 	}
 
-	sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr, false)
+	sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr)
 	return fs.NewInode(ctx, iops, m, sattr), nil
 }
 
-// newEndpointMaps creates a new endpointMaps.
-func newEndpointMaps() *endpointMaps {
-	return &endpointMaps{
-		direntMap: make(map[transport.BoundEndpoint]*fs.Dirent),
-		keyMap:    make(map[device.MultiDeviceKey]transport.BoundEndpoint),
-		pathMap:   make(map[transport.BoundEndpoint]string),
+// newOverrideMaps creates a new overrideMaps.
+func newOverrideMaps() *overrideMaps {
+	return &overrideMaps{
+		keyMap:  make(map[device.MultiDeviceKey]*overrideInfo),
+		pathMap: make(map[*overrideInfo]string),
 	}
 }
 
-// fillKeyMap populates key and dirent maps upon restore from saved
-// pathmap.
+// fillKeyMap populates key and dirent maps upon restore from saved pathmap.
 func (s *session) fillKeyMap(ctx context.Context) error {
-	unlock := s.endpoints.lock()
+	unlock := s.overrides.lock()
 	defer unlock()
 
-	for ep, dirPath := range s.endpoints.pathMap {
+	for ep, dirPath := range s.overrides.pathMap {
 		_, file, err := s.attach.walk(ctx, splitAbsolutePath(dirPath))
 		if err != nil {
 			return fmt.Errorf("error filling endpointmaps, failed to walk to %q: %v", dirPath, err)
@@ -341,25 +386,25 @@ func (s *session) fillKeyMap(ctx context.Context) error {
 			Inode:           qid.Path,
 		}
 
-		s.endpoints.keyMap[key] = ep
+		s.overrides.keyMap[key] = ep
 	}
 	return nil
 }
 
-// fillPathMap populates paths for endpoints from dirents in direntMap
+// fillPathMap populates paths for overrides from dirents in direntMap
 // before save.
 func (s *session) fillPathMap() error {
-	unlock := s.endpoints.lock()
+	unlock := s.overrides.lock()
 	defer unlock()
 
-	for ep, dir := range s.endpoints.direntMap {
-		mountRoot := dir.MountRoot()
+	for _, endpoint := range s.overrides.keyMap {
+		mountRoot := endpoint.dirent.MountRoot()
 		defer mountRoot.DecRef()
-		dirPath, _ := dir.FullName(mountRoot)
+		dirPath, _ := endpoint.dirent.FullName(mountRoot)
 		if dirPath == "" {
 			return fmt.Errorf("error getting path from dirent")
 		}
-		s.endpoints.pathMap[ep] = dirPath
+		s.overrides.pathMap[endpoint] = dirPath
 	}
 	return nil
 }
@@ -368,7 +413,7 @@ func (s *session) fillPathMap() error {
 func (s *session) restoreEndpointMaps(ctx context.Context) error {
 	// When restoring, only need to create the keyMap because the dirent and path
 	// maps got stored through the save.
-	s.endpoints.keyMap = make(map[device.MultiDeviceKey]transport.BoundEndpoint)
+	s.overrides.keyMap = make(map[device.MultiDeviceKey]*overrideInfo)
 	if err := s.fillKeyMap(ctx); err != nil {
 		return fmt.Errorf("failed to insert sockets into endpoint map: %v", err)
 	}
@@ -376,6 +421,6 @@ func (s *session) restoreEndpointMaps(ctx context.Context) error {
 	// Re-create pathMap because it can no longer be trusted as socket paths can
 	// change while process continues to run. Empty pathMap will be re-filled upon
 	// next save.
-	s.endpoints.pathMap = make(map[transport.BoundEndpoint]string)
+	s.overrides.pathMap = make(map[*overrideInfo]string)
 	return nil
 }
diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go
index 0285c5361..111da59f9 100644
--- a/pkg/sentry/fs/gofer/session_state.go
+++ b/pkg/sentry/fs/gofer/session_state.go
@@ -25,9 +25,9 @@ import (
 
 // beforeSave is invoked by stateify.
 func (s *session) beforeSave() {
-	if s.endpoints != nil {
+	if s.overrides != nil {
 		if err := s.fillPathMap(); err != nil {
-			panic("failed to save paths to endpoint map before saving" + err.Error())
+			panic("failed to save paths to override map before saving" + err.Error())
 		}
 	}
 }
@@ -74,10 +74,10 @@ func (s *session) afterLoad() {
 		panic(fmt.Sprintf("new attach name %v, want %v", opts.aname, s.aname))
 	}
 
-	// Check if endpointMaps exist when uds sockets are enabled
-	// (only pathmap will actualy have been saved).
-	if opts.privateunixsocket != (s.endpoints != nil) {
-		panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.endpoints != nil))
+	// Check if overrideMaps exist when uds sockets are enabled (only pathmaps
+	// will actually have been saved).
+	if opts.privateunixsocket != (s.overrides != nil) {
+		panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.overrides != nil))
 	}
 	if args.Flags != s.superBlockFlags {
 		panic(fmt.Sprintf("new mount flags %v, want %v", args.Flags, s.superBlockFlags))
diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go
index 376cfce2c..10ba2f5f0 100644
--- a/pkg/sentry/fs/gofer/socket.go
+++ b/pkg/sentry/fs/gofer/socket.go
@@ -32,15 +32,15 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport.
 		return nil
 	}
 
-	if i.session().endpoints != nil {
-		unlock := i.session().endpoints.lock()
+	if i.session().overrides != nil {
+		unlock := i.session().overrides.lock()
 		defer unlock()
-		ep := i.session().endpoints.get(i.fileState.key)
+		ep := i.session().overrides.getBoundEndpoint(i.fileState.key)
 		if ep != nil {
 			return ep
 		}
 
-		// Not found in endpoints map, it may be a gofer backed unix socket...
+		// Not found in overrides map, it may be a gofer backed unix socket...
 	}
 
 	inode.IncRef()
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 40e974314..8f2b75a1c 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -225,7 +225,6 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:mknod_test",
-    use_tmpfs = True,  # mknod is not supported over gofer.
 )
 
 syscall_test(
-- 
cgit v1.2.3


From c5d4041623ac6405135e966af6d06c178a86870d Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Tue, 4 Feb 2020 12:53:10 -0800
Subject: Include socket_ip_udp_loopback.cc in exportes_files

So it can be included in fuchsia's syscall tests

PiperOrigin-RevId: 293208306
---
 test/syscalls/linux/BUILD | 1 +
 1 file changed, 1 insertion(+)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index e4ca5b6db..737e2329f 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -11,6 +11,7 @@ exports_files(
         "socket_inet_loopback.cc",
         "socket_ip_loopback_blocking.cc",
         "socket_ip_tcp_loopback.cc",
+        "socket_ip_udp_loopback.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
         "tcp_socket.cc",
         "udp_socket.cc",
-- 
cgit v1.2.3


From 6823b5e244a5748032130574ae3a25a0a36bbbf5 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 4 Feb 2020 13:05:30 -0800
Subject: timer_create(2) should return 0 on success

The timer ID is copied out to the argument.

Fixes #1738

PiperOrigin-RevId: 293210801
---
 pkg/sentry/syscalls/linux/sys_timer.go |  2 +-
 test/syscalls/linux/timers.cc          | 18 +++++++++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'test/syscalls')

diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go
index 432351917..a4c400f87 100644
--- a/pkg/sentry/syscalls/linux/sys_timer.go
+++ b/pkg/sentry/syscalls/linux/sys_timer.go
@@ -146,7 +146,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 		return 0, nil, err
 	}
 
-	return uintptr(id), nil, nil
+	return 0, nil, nil
 }
 
 // TimerSettime implements linux syscall timer_settime(2).
diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc
index 3db18d7ac..2f92c27da 100644
--- a/test/syscalls/linux/timers.cc
+++ b/test/syscalls/linux/timers.cc
@@ -297,9 +297,13 @@ class IntervalTimer {
 PosixErrorOr<IntervalTimer> TimerCreate(clockid_t clockid,
                                         const struct sigevent& sev) {
   int timerid;
-  if (syscall(SYS_timer_create, clockid, &sev, &timerid) < 0) {
+  int ret = syscall(SYS_timer_create, clockid, &sev, &timerid);
+  if (ret < 0) {
     return PosixError(errno, "timer_create");
   }
+  if (ret > 0) {
+    return PosixError(EINVAL, "timer_create should never return positive");
+  }
   MaybeSave();
   return IntervalTimer(timerid);
 }
@@ -317,6 +321,18 @@ TEST(IntervalTimerTest, IsInitiallyStopped) {
   EXPECT_EQ(0, its.it_value.tv_nsec);
 }
 
+// Kernel can create multiple timers without issue.
+//
+// Regression test for gvisor.dev/issue/1738.
+TEST(IntervalTimerTest, MultipleTimers) {
+  struct sigevent sev = {};
+  sev.sigev_notify = SIGEV_NONE;
+  const auto timer1 =
+      ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+  const auto timer2 =
+      ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+}
+
 TEST(IntervalTimerTest, SingleShotSilent) {
   struct sigevent sev = {};
   sev.sigev_notify = SIGEV_NONE;
-- 
cgit v1.2.3


From a26a954946ad2e7910d3ad7578960a93b73a1f9b Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 4 Feb 2020 15:20:30 -0800
Subject: Add socket connection stress test.

Tests 65k connection attempts on common types of sockets to check for port
leaks.

Also fixes a bug where dual-stack sockets wouldn't properly re-queue
segments received while closing.

PiperOrigin-RevId: 293241166
---
 pkg/tcpip/transport/tcp/connect.go           |   4 ++
 test/syscalls/BUILD                          |   9 +++
 test/syscalls/linux/BUILD                    |  17 +++++
 test/syscalls/linux/ip_socket_test_util.cc   |  27 +++++++
 test/syscalls/linux/ip_socket_test_util.h    |  15 ++++
 test/syscalls/linux/socket_generic_stress.cc |  83 ++++++++++++++++++++++
 test/syscalls/linux/socket_test_util.cc      | 101 ++++++++++++++++++++++++---
 test/syscalls/linux/socket_test_util.h       |   6 ++
 8 files changed, 251 insertions(+), 11 deletions(-)
 create mode 100644 test/syscalls/linux/socket_generic_stress.cc

(limited to 'test/syscalls')

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 9ff7ac261..5c5397823 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -989,6 +989,10 @@ func (e *endpoint) transitionToStateCloseLocked() {
 // to any other listening endpoint. We reply with RST if we cannot find one.
 func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 	ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.ID, &s.route)
+	if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.EndpointInfo.TransportEndpointInfo.ID.LocalAddress.To4() != "" {
+		// Dual-stack socket, try IPv4.
+		ep = e.stack.FindTransportEndpoint(header.IPv4ProtocolNumber, e.TransProto, e.ID, &s.route)
+	}
 	if ep == nil {
 		replyWithReset(s)
 		s.decRef()
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 8f2b75a1c..31d239c0e 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -45,6 +45,15 @@ syscall_test(test = "//test/syscalls/linux:brk_test")
 
 syscall_test(test = "//test/syscalls/linux:socket_test")
 
+syscall_test(
+    size = "large",
+    shard_count = 50,
+    # Takes too long for TSAN. Since this is kind of a stress test that doesn't
+    # involve much concurrency, TSAN's usefulness here is limited anyway.
+    tags = ["nogotsan"],
+    test = "//test/syscalls/linux:socket_stress_test",
+)
+
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:chdir_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 737e2329f..273b014d6 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -136,6 +136,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
+        "@com_google_absl//absl/types:optional",
         "//test/util:file_descriptor",
         "//test/util:posix_error",
         "//test/util:temp_path",
@@ -2151,6 +2152,22 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_binary(
+    name = "socket_stress_test",
+    testonly = 1,
+    srcs = [
+        "socket_generic_stress.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_test_util",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
 cc_library(
     name = "socket_unix_dgram_test_cases",
     testonly = 1,
diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index 6b472eb2f..bba022a41 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -79,6 +79,33 @@ SocketPairKind DualStackTCPAcceptBindSocketPair(int type) {
                                      /* dual_stack = */ true)};
 }
 
+SocketPairKind IPv6TCPAcceptBindPersistentListenerSocketPair(int type) {
+  std::string description =
+      absl::StrCat(DescribeSocketType(type), "connected IPv6 TCP socket");
+  return SocketPairKind{description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP,
+                        TCPAcceptBindPersistentListenerSocketPairCreator(
+                            AF_INET6, type | SOCK_STREAM, 0,
+                            /* dual_stack = */ false)};
+}
+
+SocketPairKind IPv4TCPAcceptBindPersistentListenerSocketPair(int type) {
+  std::string description =
+      absl::StrCat(DescribeSocketType(type), "connected IPv4 TCP socket");
+  return SocketPairKind{description, AF_INET, type | SOCK_STREAM, IPPROTO_TCP,
+                        TCPAcceptBindPersistentListenerSocketPairCreator(
+                            AF_INET, type | SOCK_STREAM, 0,
+                            /* dual_stack = */ false)};
+}
+
+SocketPairKind DualStackTCPAcceptBindPersistentListenerSocketPair(int type) {
+  std::string description =
+      absl::StrCat(DescribeSocketType(type), "connected dual stack TCP socket");
+  return SocketPairKind{description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP,
+                        TCPAcceptBindPersistentListenerSocketPairCreator(
+                            AF_INET6, type | SOCK_STREAM, 0,
+                            /* dual_stack = */ true)};
+}
+
 SocketPairKind IPv6UDPBidirectionalBindSocketPair(int type) {
   std::string description =
       absl::StrCat(DescribeSocketType(type), "connected IPv6 UDP socket");
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 0f58e0f77..083ebbcf0 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -50,6 +50,21 @@ SocketPairKind IPv4TCPAcceptBindSocketPair(int type);
 // given type bound to the IPv4 loopback.
 SocketPairKind DualStackTCPAcceptBindSocketPair(int type);
 
+// IPv6TCPAcceptBindPersistentListenerSocketPair is like
+// IPv6TCPAcceptBindSocketPair except it uses a persistent listening socket to
+// create all socket pairs.
+SocketPairKind IPv6TCPAcceptBindPersistentListenerSocketPair(int type);
+
+// IPv4TCPAcceptBindPersistentListenerSocketPair is like
+// IPv4TCPAcceptBindSocketPair except it uses a persistent listening socket to
+// create all socket pairs.
+SocketPairKind IPv4TCPAcceptBindPersistentListenerSocketPair(int type);
+
+// DualStackTCPAcceptBindPersistentListenerSocketPair is like
+// DualStackTCPAcceptBindSocketPair except it uses a persistent listening socket
+// to create all socket pairs.
+SocketPairKind DualStackTCPAcceptBindPersistentListenerSocketPair(int type);
+
 // IPv6UDPBidirectionalBindSocketPair returns a SocketPairKind that represents
 // SocketPairs created with bind() and connect() syscalls with AF_INET6 and the
 // given type bound to the IPv6 loopback.
diff --git a/test/syscalls/linux/socket_generic_stress.cc b/test/syscalls/linux/socket_generic_stress.cc
new file mode 100644
index 000000000..6a232238d
--- /dev/null
+++ b/test/syscalls/linux/socket_generic_stress.cc
@@ -0,0 +1,83 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected sockets.
+using ConnectStressTest = SocketPairTest;
+
+TEST_P(ConnectStressTest, Reset65kTimes) {
+  for (int i = 0; i < 1 << 16; ++i) {
+    auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+    // Send some data to ensure that the connection gets reset and the port gets
+    // released immediately. This avoids either end entering TIME-WAIT.
+    char sent_data[100] = {};
+    ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)),
+                SyscallSucceedsWithValue(sizeof(sent_data)));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllConnectedSockets, ConnectStressTest,
+    ::testing::Values(IPv6UDPBidirectionalBindSocketPair(0),
+                      IPv4UDPBidirectionalBindSocketPair(0),
+                      DualStackUDPBidirectionalBindSocketPair(0),
+
+                      // Without REUSEADDR, we get port exhaustion on Linux.
+                      SetSockOpt(SOL_SOCKET, SO_REUSEADDR,
+                                 &kSockOptOn)(IPv6TCPAcceptBindSocketPair(0)),
+                      SetSockOpt(SOL_SOCKET, SO_REUSEADDR,
+                                 &kSockOptOn)(IPv4TCPAcceptBindSocketPair(0)),
+                      SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+                          DualStackTCPAcceptBindSocketPair(0))));
+
+// Test fixture for tests that apply to pairs of connected sockets created with
+// a persistent listener (if applicable).
+using PersistentListenerConnectStressTest = SocketPairTest;
+
+TEST_P(PersistentListenerConnectStressTest, 65kTimes) {
+  for (int i = 0; i < 1 << 16; ++i) {
+    auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllConnectedSockets, PersistentListenerConnectStressTest,
+    ::testing::Values(
+        IPv6UDPBidirectionalBindSocketPair(0),
+        IPv4UDPBidirectionalBindSocketPair(0),
+        DualStackUDPBidirectionalBindSocketPair(0),
+
+        // Without REUSEADDR, we get port exhaustion on Linux.
+        SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+            IPv6TCPAcceptBindPersistentListenerSocketPair(0)),
+        SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+            IPv4TCPAcceptBindPersistentListenerSocketPair(0)),
+        SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+            DualStackTCPAcceptBindPersistentListenerSocketPair(0))));
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
index eff7d577e..c0c5ab3fe 100644
--- a/test/syscalls/linux/socket_test_util.cc
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -18,10 +18,13 @@
 #include <poll.h>
 #include <sys/socket.h>
 
+#include <memory>
+
 #include "gtest/gtest.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/time/clock.h"
+#include "absl/types/optional.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
@@ -109,7 +112,10 @@ Creator<SocketPair> AcceptBindSocketPairCreator(bool abstract, int domain,
       MaybeSave();  // Unlinked path.
     }
 
-    return absl::make_unique<AddrFDSocketPair>(connected, accepted, bind_addr,
+    // accepted is before connected to destruct connected before accepted.
+    // Destructors for nonstatic member objects are called in the reverse order
+    // in which they appear in the class declaration.
+    return absl::make_unique<AddrFDSocketPair>(accepted, connected, bind_addr,
                                                extra_addr);
   };
 }
@@ -311,11 +317,16 @@ PosixErrorOr<T> BindIP(int fd, bool dual_stack) {
 }
 
 template <typename T>
-PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateTCPAcceptBindSocketPair(
-    int bound, int connected, int type, bool dual_stack) {
-  ASSIGN_OR_RETURN_ERRNO(T bind_addr, BindIP<T>(bound, dual_stack));
-  RETURN_ERROR_IF_SYSCALL_FAIL(listen(bound, /* backlog = */ 5));
+PosixErrorOr<T> TCPBindAndListen(int fd, bool dual_stack) {
+  ASSIGN_OR_RETURN_ERRNO(T addr, BindIP<T>(fd, dual_stack));
+  RETURN_ERROR_IF_SYSCALL_FAIL(listen(fd, /* backlog = */ 5));
+  return addr;
+}
 
+template <typename T>
+PosixErrorOr<std::unique_ptr<AddrFDSocketPair>>
+CreateTCPConnectAcceptSocketPair(int bound, int connected, int type,
+                                 bool dual_stack, T bind_addr) {
   int connect_result = 0;
   RETURN_ERROR_IF_SYSCALL_FAIL(
       (connect_result = RetryEINTR(connect)(
@@ -358,16 +369,27 @@ PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateTCPAcceptBindSocketPair(
     absl::SleepFor(absl::Seconds(1));
   }
 
-  // Cleanup no longer needed resources.
-  RETURN_ERROR_IF_SYSCALL_FAIL(close(bound));
-  MaybeSave();  // Successful close.
-
   T extra_addr = {};
   LocalhostAddr(&extra_addr, dual_stack);
   return absl::make_unique<AddrFDSocketPair>(connected, accepted, bind_addr,
                                              extra_addr);
 }
 
+template <typename T>
+PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateTCPAcceptBindSocketPair(
+    int bound, int connected, int type, bool dual_stack) {
+  ASSIGN_OR_RETURN_ERRNO(T bind_addr, TCPBindAndListen<T>(bound, dual_stack));
+
+  auto result = CreateTCPConnectAcceptSocketPair(bound, connected, type,
+                                                 dual_stack, bind_addr);
+
+  // Cleanup no longer needed resources.
+  RETURN_ERROR_IF_SYSCALL_FAIL(close(bound));
+  MaybeSave();  // Successful close.
+
+  return result;
+}
+
 Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
                                                    int protocol,
                                                    bool dual_stack) {
@@ -389,6 +411,63 @@ Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
   };
 }
 
+Creator<SocketPair> TCPAcceptBindPersistentListenerSocketPairCreator(
+    int domain, int type, int protocol, bool dual_stack) {
+  // These are lazily initialized below, on the first call to the returned
+  // lambda. These values are private to each returned lambda, but shared across
+  // invocations of a specific lambda.
+  //
+  // The sharing allows pairs created with the same parameters to share a
+  // listener. This prevents future connects from failing if the connecting
+  // socket selects a port which had previously been used by a listening socket
+  // that still has some connections in TIME-WAIT.
+  //
+  // The lazy initialization is to avoid creating sockets during parameter
+  // enumeration. This is important because parameters are enumerated during the
+  // build process where networking may not be available.
+  auto listener = std::make_shared<absl::optional<int>>(absl::optional<int>());
+  auto addr4 = std::make_shared<absl::optional<sockaddr_in>>(
+      absl::optional<sockaddr_in>());
+  auto addr6 = std::make_shared<absl::optional<sockaddr_in6>>(
+      absl::optional<sockaddr_in6>());
+
+  return [=]() -> PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> {
+    int connected;
+    RETURN_ERROR_IF_SYSCALL_FAIL(connected = socket(domain, type, protocol));
+    MaybeSave();  // Successful socket creation.
+
+    // Share the listener across invocations.
+    if (!listener->has_value()) {
+      int fd = socket(domain, type, protocol);
+      if (fd < 0) {
+        return PosixError(errno, absl::StrCat("socket(", domain, ", ", type,
+                                              ", ", protocol, ")"));
+      }
+      listener->emplace(fd);
+      MaybeSave();  // Successful socket creation.
+    }
+
+    // Bind the listener once, but create a new connect/accept pair each
+    // time.
+    if (domain == AF_INET) {
+      if (!addr4->has_value()) {
+        addr4->emplace(
+            TCPBindAndListen<sockaddr_in>(listener->value(), dual_stack)
+                .ValueOrDie());
+      }
+      return CreateTCPConnectAcceptSocketPair(listener->value(), connected,
+                                              type, dual_stack, addr4->value());
+    }
+    if (!addr6->has_value()) {
+      addr6->emplace(
+          TCPBindAndListen<sockaddr_in6>(listener->value(), dual_stack)
+              .ValueOrDie());
+    }
+    return CreateTCPConnectAcceptSocketPair(listener->value(), connected, type,
+                                            dual_stack, addr6->value());
+  };
+}
+
 template <typename T>
 PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateUDPBoundSocketPair(
     int sock1, int sock2, int type, bool dual_stack) {
@@ -518,8 +597,8 @@ size_t CalculateUnixSockAddrLen(const char* sun_path) {
   if (sun_path[0] == 0) {
     return sizeof(sockaddr_un);
   }
-  // Filesystem addresses use the address length plus the 2 byte sun_family and
-  // null terminator.
+  // Filesystem addresses use the address length plus the 2 byte sun_family
+  // and null terminator.
   return strlen(sun_path) + 3;
 }
 
diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h
index 2dbb8bed3..bfaa6e397 100644
--- a/test/syscalls/linux/socket_test_util.h
+++ b/test/syscalls/linux/socket_test_util.h
@@ -273,6 +273,12 @@ Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
                                                    int protocol,
                                                    bool dual_stack);
 
+// TCPAcceptBindPersistentListenerSocketPairCreator is like
+// TCPAcceptBindSocketPairCreator, except it uses the same listening socket to
+// create all SocketPairs.
+Creator<SocketPair> TCPAcceptBindPersistentListenerSocketPairCreator(
+    int domain, int type, int protocol, bool dual_stack);
+
 // UDPBidirectionalBindSocketPairCreator returns a Creator<SocketPair> that
 // obtains file descriptors by invoking the bind() and connect() syscalls on UDP
 // sockets.
-- 
cgit v1.2.3


From 665b614e4a6e715bac25bea15c5c29184016e549 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Tue, 4 Feb 2020 18:04:26 -0800
Subject: Support RTM_NEWADDR and RTM_GETLINK in (rt)netlink.

PiperOrigin-RevId: 293271055
---
 pkg/sentry/inet/inet.go                      |   4 +
 pkg/sentry/inet/test_stack.go                |   6 +
 pkg/sentry/socket/hostinet/stack.go          |   5 +
 pkg/sentry/socket/netlink/BUILD              |  14 +-
 pkg/sentry/socket/netlink/message.go         | 129 +++++++++++
 pkg/sentry/socket/netlink/message_test.go    | 312 +++++++++++++++++++++++++++
 pkg/sentry/socket/netlink/provider.go        |   2 +-
 pkg/sentry/socket/netlink/route/BUILD        |   2 -
 pkg/sentry/socket/netlink/route/protocol.go  | 238 ++++++++++++++------
 pkg/sentry/socket/netlink/socket.go          |  54 ++---
 pkg/sentry/socket/netlink/uevent/protocol.go |   2 +-
 pkg/sentry/socket/netstack/stack.go          |  55 +++++
 pkg/tcpip/stack/stack.go                     |   9 +
 test/syscalls/linux/BUILD                    |   2 +
 test/syscalls/linux/socket_netlink_route.cc  | 296 ++++++++++++++++++++-----
 test/syscalls/linux/socket_netlink_util.cc   |  45 +++-
 test/syscalls/linux/socket_netlink_util.h    |   9 +
 17 files changed, 1022 insertions(+), 162 deletions(-)
 create mode 100644 pkg/sentry/socket/netlink/message_test.go

(limited to 'test/syscalls')

diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index a7dfb78a7..2916a0644 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -28,6 +28,10 @@ type Stack interface {
 	// interface indexes to a slice of associated interface address properties.
 	InterfaceAddrs() map[int32][]InterfaceAddr
 
+	// AddInterfaceAddr adds an address to the network interface identified by
+	// index.
+	AddInterfaceAddr(idx int32, addr InterfaceAddr) error
+
 	// SupportsIPv6 returns true if the stack supports IPv6 connectivity.
 	SupportsIPv6() bool
 
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index dcfcbd97e..d8961fc94 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -47,6 +47,12 @@ func (s *TestStack) InterfaceAddrs() map[int32][]InterfaceAddr {
 	return s.InterfaceAddrsMap
 }
 
+// AddInterfaceAddr implements Stack.AddInterfaceAddr.
+func (s *TestStack) AddInterfaceAddr(idx int32, addr InterfaceAddr) error {
+	s.InterfaceAddrsMap[idx] = append(s.InterfaceAddrsMap[idx], addr)
+	return nil
+}
+
 // SupportsIPv6 implements Stack.SupportsIPv6.
 func (s *TestStack) SupportsIPv6() bool {
 	return s.SupportsIPv6Flag
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index 034eca676..a48082631 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -310,6 +310,11 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
 	return addrs
 }
 
+// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
+func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+	return syserror.EACCES
+}
+
 // SupportsIPv6 implements inet.Stack.SupportsIPv6.
 func (s *Stack) SupportsIPv6() bool {
 	return s.supportsIPv6
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index f8b8e467d..1911cd9b8 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -33,3 +33,15 @@ go_library(
         "//pkg/waiter",
     ],
 )
+
+go_test(
+    name = "netlink_test",
+    size = "small",
+    srcs = [
+        "message_test.go",
+    ],
+    deps = [
+        ":netlink",
+        "//pkg/abi/linux",
+    ],
+)
diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go
index b21e0ca4b..4ea252ccb 100644
--- a/pkg/sentry/socket/netlink/message.go
+++ b/pkg/sentry/socket/netlink/message.go
@@ -30,8 +30,16 @@ func alignUp(length int, align uint) int {
 	return (length + int(align) - 1) &^ (int(align) - 1)
 }
 
+// alignPad returns the length of padding required for alignment.
+//
+// Preconditions: align is a power of two.
+func alignPad(length int, align uint) int {
+	return alignUp(length, align) - length
+}
+
 // Message contains a complete serialized netlink message.
 type Message struct {
+	hdr linux.NetlinkMessageHeader
 	buf []byte
 }
 
@@ -40,10 +48,86 @@ type Message struct {
 // The header length will be updated by Finalize.
 func NewMessage(hdr linux.NetlinkMessageHeader) *Message {
 	return &Message{
+		hdr: hdr,
 		buf: binary.Marshal(nil, usermem.ByteOrder, hdr),
 	}
 }
 
+// ParseMessage parses the first message seen at buf, returning the rest of the
+// buffer. If message is malformed, ok of false is returned. For last message,
+// padding check is loose, if there isn't enought padding, whole buf is consumed
+// and ok is set to true.
+func ParseMessage(buf []byte) (msg *Message, rest []byte, ok bool) {
+	b := BytesView(buf)
+
+	hdrBytes, ok := b.Extract(linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return
+	}
+	var hdr linux.NetlinkMessageHeader
+	binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr)
+
+	// Msg portion.
+	totalMsgLen := int(hdr.Length)
+	_, ok = b.Extract(totalMsgLen - linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return
+	}
+
+	// Padding.
+	numPad := alignPad(totalMsgLen, linux.NLMSG_ALIGNTO)
+	// Linux permits the last message not being aligned, just consume all of it.
+	// Ref: net/netlink/af_netlink.c:netlink_rcv_skb
+	if numPad > len(b) {
+		numPad = len(b)
+	}
+	_, ok = b.Extract(numPad)
+	if !ok {
+		return
+	}
+
+	return &Message{
+		hdr: hdr,
+		buf: buf[:totalMsgLen],
+	}, []byte(b), true
+}
+
+// Header returns the header of this message.
+func (m *Message) Header() linux.NetlinkMessageHeader {
+	return m.hdr
+}
+
+// GetData unmarshals the payload message header from this netlink message, and
+// returns the attributes portion.
+func (m *Message) GetData(msg interface{}) (AttrsView, bool) {
+	b := BytesView(m.buf)
+
+	_, ok := b.Extract(linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return nil, false
+	}
+
+	size := int(binary.Size(msg))
+	msgBytes, ok := b.Extract(size)
+	if !ok {
+		return nil, false
+	}
+	binary.Unmarshal(msgBytes, usermem.ByteOrder, msg)
+
+	numPad := alignPad(linux.NetlinkMessageHeaderSize+size, linux.NLMSG_ALIGNTO)
+	// Linux permits the last message not being aligned, just consume all of it.
+	// Ref: net/netlink/af_netlink.c:netlink_rcv_skb
+	if numPad > len(b) {
+		numPad = len(b)
+	}
+	_, ok = b.Extract(numPad)
+	if !ok {
+		return nil, false
+	}
+
+	return AttrsView(b), true
+}
+
 // Finalize returns the []byte containing the entire message, with the total
 // length set in the message header. The Message must not be modified after
 // calling Finalize.
@@ -157,3 +241,48 @@ func (ms *MessageSet) AddMessage(hdr linux.NetlinkMessageHeader) *Message {
 	ms.Messages = append(ms.Messages, m)
 	return m
 }
+
+// AttrsView is a view into the attributes portion of a netlink message.
+type AttrsView []byte
+
+// Empty returns whether there is no attribute left in v.
+func (v AttrsView) Empty() bool {
+	return len(v) == 0
+}
+
+// ParseFirst parses first netlink attribute at the beginning of v.
+func (v AttrsView) ParseFirst() (hdr linux.NetlinkAttrHeader, value []byte, rest AttrsView, ok bool) {
+	b := BytesView(v)
+
+	hdrBytes, ok := b.Extract(linux.NetlinkAttrHeaderSize)
+	if !ok {
+		return
+	}
+	binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr)
+
+	value, ok = b.Extract(int(hdr.Length) - linux.NetlinkAttrHeaderSize)
+	if !ok {
+		return
+	}
+
+	_, ok = b.Extract(alignPad(int(hdr.Length), linux.NLA_ALIGNTO))
+	if !ok {
+		return
+	}
+
+	return hdr, value, AttrsView(b), ok
+}
+
+// BytesView supports extracting data from a byte slice with bounds checking.
+type BytesView []byte
+
+// Extract removes the first n bytes from v and returns it. If n is out of
+// bounds, it returns false.
+func (v *BytesView) Extract(n int) ([]byte, bool) {
+	if n < 0 || n > len(*v) {
+		return nil, false
+	}
+	extracted := (*v)[:n]
+	*v = (*v)[n:]
+	return extracted, true
+}
diff --git a/pkg/sentry/socket/netlink/message_test.go b/pkg/sentry/socket/netlink/message_test.go
new file mode 100644
index 000000000..ef13d9386
--- /dev/null
+++ b/pkg/sentry/socket/netlink/message_test.go
@@ -0,0 +1,312 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package message_test
+
+import (
+	"bytes"
+	"reflect"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
+)
+
+type dummyNetlinkMsg struct {
+	Foo uint16
+}
+
+func TestParseMessage(t *testing.T) {
+	tests := []struct {
+		desc  string
+		input []byte
+
+		header  linux.NetlinkMessageHeader
+		dataMsg *dummyNetlinkMsg
+		restLen int
+		ok      bool
+	}{
+		{
+			desc: "valid",
+			input: []byte{
+				0x14, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 20,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "valid with next message",
+			input: []byte{
+				0x14, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+				0xFF, // Next message (rest)
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 20,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 1,
+			ok:      true,
+		},
+		{
+			desc: "valid for last message without padding",
+			input: []byte{
+				0x12, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, // Data message
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 18,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "valid for last message not to be aligned",
+			input: []byte{
+				0x13, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, // Data message
+				0x00, // Excessive 1 byte permitted at end
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 19,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "header.Length too short",
+			input: []byte{
+				0x04, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			ok: false,
+		},
+		{
+			desc: "header.Length too long",
+			input: []byte{
+				0xFF, 0xFF, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			ok: false,
+		},
+		{
+			desc: "header incomplete",
+			input: []byte{
+				0x04, 0x00, 0x00, 0x00, // Length
+			},
+			ok: false,
+		},
+		{
+			desc:  "empty message",
+			input: []byte{},
+			ok:    false,
+		},
+	}
+	for _, test := range tests {
+		msg, rest, ok := netlink.ParseMessage(test.input)
+		if ok != test.ok {
+			t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok)
+			continue
+		}
+		if !test.ok {
+			continue
+		}
+		if !reflect.DeepEqual(msg.Header(), test.header) {
+			t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, msg.Header(), test.header)
+		}
+
+		dataMsg := &dummyNetlinkMsg{}
+		_, dataOk := msg.GetData(dataMsg)
+		if !dataOk {
+			t.Errorf("%v: GetData.ok = %v, want = true", test.desc, dataOk)
+		} else if !reflect.DeepEqual(dataMsg, test.dataMsg) {
+			t.Errorf("%v: GetData.msg = %+v, want = %+v", test.desc, dataMsg, test.dataMsg)
+		}
+
+		if got, want := rest, test.input[len(test.input)-test.restLen:]; !bytes.Equal(got, want) {
+			t.Errorf("%v: got rest = %v, want = %v", test.desc, got, want)
+		}
+	}
+}
+
+func TestAttrView(t *testing.T) {
+	tests := []struct {
+		desc  string
+		input []byte
+
+		// Outputs for ParseFirst.
+		hdr     linux.NetlinkAttrHeader
+		value   []byte
+		restLen int
+		ok      bool
+
+		// Outputs for Empty.
+		isEmpty bool
+	}{
+		{
+			desc: "valid",
+			input: []byte{
+				0x06, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x00, 0x00, // Data with 2 bytes padding
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 6,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31},
+			restLen: 0,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "at alignment",
+			input: []byte{
+				0x08, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 8,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31, 0x32, 0x33},
+			restLen: 0,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "at alignment with rest data",
+			input: []byte{
+				0x08, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+				0xFF, 0xFE, // Rest data
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 8,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31, 0x32, 0x33},
+			restLen: 2,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "hdr.Length too long",
+			input: []byte{
+				0xFF, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			ok:      false,
+			isEmpty: false,
+		},
+		{
+			desc: "hdr.Length too short",
+			input: []byte{
+				0x01, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			ok:      false,
+			isEmpty: false,
+		},
+		{
+			desc:    "empty",
+			input:   []byte{},
+			ok:      false,
+			isEmpty: true,
+		},
+	}
+	for _, test := range tests {
+		attrs := netlink.AttrsView(test.input)
+
+		// Test ParseFirst().
+		hdr, value, rest, ok := attrs.ParseFirst()
+		if ok != test.ok {
+			t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok)
+		} else if test.ok {
+			if !reflect.DeepEqual(hdr, test.hdr) {
+				t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, hdr, test.hdr)
+			}
+			if !bytes.Equal(value, test.value) {
+				t.Errorf("%v: got value = %v, want = %v", test.desc, value, test.value)
+			}
+			if wantRest := test.input[len(test.input)-test.restLen:]; !bytes.Equal(rest, wantRest) {
+				t.Errorf("%v: got rest = %v, want = %v", test.desc, rest, wantRest)
+			}
+		}
+
+		// Test Empty().
+		if got, want := attrs.Empty(), test.isEmpty; got != want {
+			t.Errorf("%v: got empty = %v, want = %v", test.desc, got, want)
+		}
+	}
+}
diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go
index 07f860a49..b0dc70e5c 100644
--- a/pkg/sentry/socket/netlink/provider.go
+++ b/pkg/sentry/socket/netlink/provider.go
@@ -42,7 +42,7 @@ type Protocol interface {
 	// If err == nil, any messages added to ms will be sent back to the
 	// other end of the socket. Setting ms.Multi will cause an NLMSG_DONE
 	// message to be sent even if ms contains no messages.
-	ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *MessageSet) *syserr.Error
+	ProcessMessage(ctx context.Context, msg *Message, ms *MessageSet) *syserr.Error
 }
 
 // Provider is a function that creates a new Protocol for a specific netlink
diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD
index 622a1eafc..93127398d 100644
--- a/pkg/sentry/socket/netlink/route/BUILD
+++ b/pkg/sentry/socket/netlink/route/BUILD
@@ -10,13 +10,11 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/context",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket/netlink",
         "//pkg/syserr",
-        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index 2b3c7f5b3..c84d8bd7c 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -17,16 +17,15 @@ package route
 
 import (
 	"bytes"
+	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	"gvisor.dev/gvisor/pkg/syserr"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // commandKind describes the operational class of a message type.
@@ -69,13 +68,7 @@ func (p *Protocol) CanSend() bool {
 }
 
 // dumpLinks handles RTM_GETLINK dump requests.
-func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
-	// TODO(b/68878065): Only the dump variant of the types below are
-	// supported.
-	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
-		return syserr.ErrNotSupported
-	}
-
+func (p *Protocol) dumpLinks(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// NLM_F_DUMP + RTM_GETLINK messages are supposed to include an
 	// ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some
 	// userspace applications (including glibc) still include rtgenmsg.
@@ -99,44 +92,105 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader
 		return nil
 	}
 
-	for id, i := range stack.Interfaces() {
-		m := ms.AddMessage(linux.NetlinkMessageHeader{
-			Type: linux.RTM_NEWLINK,
-		})
+	for idx, i := range stack.Interfaces() {
+		addNewLinkMessage(ms, idx, i)
+	}
 
-		m.Put(linux.InterfaceInfoMessage{
-			Family: linux.AF_UNSPEC,
-			Type:   i.DeviceType,
-			Index:  id,
-			Flags:  i.Flags,
-		})
+	return nil
+}
 
-		m.PutAttrString(linux.IFLA_IFNAME, i.Name)
-		m.PutAttr(linux.IFLA_MTU, i.MTU)
+// getLinks handles RTM_GETLINK requests.
+func (p *Protocol) getLink(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	stack := inet.StackFromContext(ctx)
+	if stack == nil {
+		// No network devices.
+		return nil
+	}
 
-		mac := make([]byte, 6)
-		brd := mac
-		if len(i.Addr) > 0 {
-			mac = i.Addr
-			brd = bytes.Repeat([]byte{0xff}, len(i.Addr))
+	// Parse message.
+	var ifi linux.InterfaceInfoMessage
+	attrs, ok := msg.GetData(&ifi)
+	if !ok {
+		return syserr.ErrInvalidArgument
+	}
+
+	// Parse attributes.
+	var byName []byte
+	for !attrs.Empty() {
+		ahdr, value, rest, ok := attrs.ParseFirst()
+		if !ok {
+			return syserr.ErrInvalidArgument
 		}
-		m.PutAttr(linux.IFLA_ADDRESS, mac)
-		m.PutAttr(linux.IFLA_BROADCAST, brd)
+		attrs = rest
 
-		// TODO(gvisor.dev/issue/578): There are many more attributes.
+		switch ahdr.Type {
+		case linux.IFLA_IFNAME:
+			if len(value) < 1 {
+				return syserr.ErrInvalidArgument
+			}
+			byName = value[:len(value)-1]
+
+			// TODO(gvisor.dev/issue/578): Support IFLA_EXT_MASK.
+		}
 	}
 
+	found := false
+	for idx, i := range stack.Interfaces() {
+		switch {
+		case ifi.Index > 0:
+			if idx != ifi.Index {
+				continue
+			}
+		case byName != nil:
+			if string(byName) != i.Name {
+				continue
+			}
+		default:
+			// Criteria not specified.
+			return syserr.ErrInvalidArgument
+		}
+
+		addNewLinkMessage(ms, idx, i)
+		found = true
+		break
+	}
+	if !found {
+		return syserr.ErrNoDevice
+	}
 	return nil
 }
 
-// dumpAddrs handles RTM_GETADDR dump requests.
-func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
-	// TODO(b/68878065): Only the dump variant of the types below are
-	// supported.
-	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
-		return syserr.ErrNotSupported
+// addNewLinkMessage appends RTM_NEWLINK message for the given interface into
+// the message set.
+func addNewLinkMessage(ms *netlink.MessageSet, idx int32, i inet.Interface) {
+	m := ms.AddMessage(linux.NetlinkMessageHeader{
+		Type: linux.RTM_NEWLINK,
+	})
+
+	m.Put(linux.InterfaceInfoMessage{
+		Family: linux.AF_UNSPEC,
+		Type:   i.DeviceType,
+		Index:  idx,
+		Flags:  i.Flags,
+	})
+
+	m.PutAttrString(linux.IFLA_IFNAME, i.Name)
+	m.PutAttr(linux.IFLA_MTU, i.MTU)
+
+	mac := make([]byte, 6)
+	brd := mac
+	if len(i.Addr) > 0 {
+		mac = i.Addr
+		brd = bytes.Repeat([]byte{0xff}, len(i.Addr))
 	}
+	m.PutAttr(linux.IFLA_ADDRESS, mac)
+	m.PutAttr(linux.IFLA_BROADCAST, brd)
+
+	// TODO(gvisor.dev/issue/578): There are many more attributes.
+}
 
+// dumpAddrs handles RTM_GETADDR dump requests.
+func (p *Protocol) dumpAddrs(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// RTM_GETADDR dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
@@ -168,6 +222,7 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader
 				Index:     uint32(id),
 			})
 
+			m.PutAttr(linux.IFA_LOCAL, []byte(a.Addr))
 			m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr))
 
 			// TODO(gvisor.dev/issue/578): There are many more attributes.
@@ -252,12 +307,12 @@ func fillRoute(routes []inet.Route, addr []byte) (inet.Route, *syserr.Error) {
 }
 
 // parseForDestination parses a message as format of RouteMessage-RtAttr-dst.
-func parseForDestination(data []byte) ([]byte, *syserr.Error) {
+func parseForDestination(msg *netlink.Message) ([]byte, *syserr.Error) {
 	var rtMsg linux.RouteMessage
-	if len(data) < linux.SizeOfRouteMessage {
+	attrs, ok := msg.GetData(&rtMsg)
+	if !ok {
 		return nil, syserr.ErrInvalidArgument
 	}
-	binary.Unmarshal(data[:linux.SizeOfRouteMessage], usermem.ByteOrder, &rtMsg)
 	// iproute2 added the RTM_F_LOOKUP_TABLE flag in version v4.4.0. See
 	// commit bc234301af12. Note we don't check this flag for backward
 	// compatibility.
@@ -265,26 +320,15 @@ func parseForDestination(data []byte) ([]byte, *syserr.Error) {
 		return nil, syserr.ErrNotSupported
 	}
 
-	data = data[linux.SizeOfRouteMessage:]
-
-	// TODO(gvisor.dev/issue/1611): Add generic attribute parsing.
-	var rtAttr linux.RtAttr
-	if len(data) < linux.SizeOfRtAttr {
-		return nil, syserr.ErrInvalidArgument
+	// Expect first attribute is RTA_DST.
+	if hdr, value, _, ok := attrs.ParseFirst(); ok && hdr.Type == linux.RTA_DST {
+		return value, nil
 	}
-	binary.Unmarshal(data[:linux.SizeOfRtAttr], usermem.ByteOrder, &rtAttr)
-	if rtAttr.Type != linux.RTA_DST {
-		return nil, syserr.ErrInvalidArgument
-	}
-
-	if len(data) < int(rtAttr.Len) {
-		return nil, syserr.ErrInvalidArgument
-	}
-	return data[linux.SizeOfRtAttr:rtAttr.Len], nil
+	return nil, syserr.ErrInvalidArgument
 }
 
 // dumpRoutes handles RTM_GETROUTE requests.
-func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) dumpRoutes(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// RTM_GETROUTE dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
@@ -295,10 +339,11 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade
 		return nil
 	}
 
+	hdr := msg.Header()
 	routeTables := stack.RouteTable()
 
 	if hdr.Flags == linux.NLM_F_REQUEST {
-		dst, err := parseForDestination(data)
+		dst, err := parseForDestination(msg)
 		if err != nil {
 			return err
 		}
@@ -357,10 +402,55 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade
 	return nil
 }
 
+// newAddr handles RTM_NEWADDR requests.
+func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	stack := inet.StackFromContext(ctx)
+	if stack == nil {
+		// No network stack.
+		return syserr.ErrProtocolNotSupported
+	}
+
+	var ifa linux.InterfaceAddrMessage
+	attrs, ok := msg.GetData(&ifa)
+	if !ok {
+		return syserr.ErrInvalidArgument
+	}
+
+	for !attrs.Empty() {
+		ahdr, value, rest, ok := attrs.ParseFirst()
+		if !ok {
+			return syserr.ErrInvalidArgument
+		}
+		attrs = rest
+
+		switch ahdr.Type {
+		case linux.IFA_LOCAL:
+			err := stack.AddInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{
+				Family:    ifa.Family,
+				PrefixLen: ifa.PrefixLen,
+				Flags:     ifa.Flags,
+				Addr:      value,
+			})
+			if err == syscall.EEXIST {
+				flags := msg.Header().Flags
+				if flags&linux.NLM_F_EXCL != 0 {
+					return syserr.ErrExists
+				}
+			} else if err != nil {
+				return syserr.ErrInvalidArgument
+			}
+		}
+	}
+	return nil
+}
+
 // ProcessMessage implements netlink.Protocol.ProcessMessage.
-func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	hdr := msg.Header()
+
 	// All messages start with a 1 byte protocol family.
-	if len(data) < 1 {
+	var family uint8
+	if _, ok := msg.GetData(&family); !ok {
 		// Linux ignores messages missing the protocol family. See
 		// net/core/rtnetlink.c:rtnetlink_rcv_msg.
 		return nil
@@ -374,16 +464,32 @@ func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageH
 		}
 	}
 
-	switch hdr.Type {
-	case linux.RTM_GETLINK:
-		return p.dumpLinks(ctx, hdr, data, ms)
-	case linux.RTM_GETADDR:
-		return p.dumpAddrs(ctx, hdr, data, ms)
-	case linux.RTM_GETROUTE:
-		return p.dumpRoutes(ctx, hdr, data, ms)
-	default:
-		return syserr.ErrNotSupported
+	if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP {
+		// TODO(b/68878065): Only the dump variant of the types below are
+		// supported.
+		switch hdr.Type {
+		case linux.RTM_GETLINK:
+			return p.dumpLinks(ctx, msg, ms)
+		case linux.RTM_GETADDR:
+			return p.dumpAddrs(ctx, msg, ms)
+		case linux.RTM_GETROUTE:
+			return p.dumpRoutes(ctx, msg, ms)
+		default:
+			return syserr.ErrNotSupported
+		}
+	} else if hdr.Flags&linux.NLM_F_REQUEST == linux.NLM_F_REQUEST {
+		switch hdr.Type {
+		case linux.RTM_GETLINK:
+			return p.getLink(ctx, msg, ms)
+		case linux.RTM_GETROUTE:
+			return p.dumpRoutes(ctx, msg, ms)
+		case linux.RTM_NEWADDR:
+			return p.newAddr(ctx, msg, ms)
+		default:
+			return syserr.ErrNotSupported
+		}
 	}
+	return syserr.ErrNotSupported
 }
 
 // init registers the NETLINK_ROUTE provider.
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index c4b95debb..2ca02567d 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -644,47 +644,38 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error
 	return nil
 }
 
-func (s *Socket) dumpErrorMesage(ctx context.Context, hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) *syserr.Error {
+func dumpErrorMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) {
 	m := ms.AddMessage(linux.NetlinkMessageHeader{
 		Type: linux.NLMSG_ERROR,
 	})
-
 	m.Put(linux.NetlinkErrorMessage{
 		Error:  int32(-err.ToLinux().Number()),
 		Header: hdr,
 	})
-	return nil
+}
 
+func dumpAckMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet) {
+	m := ms.AddMessage(linux.NetlinkMessageHeader{
+		Type: linux.NLMSG_ERROR,
+	})
+	m.Put(linux.NetlinkErrorMessage{
+		Error:  0,
+		Header: hdr,
+	})
 }
 
 // processMessages handles each message in buf, passing it to the protocol
 // handler for final handling.
 func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error {
 	for len(buf) > 0 {
-		if len(buf) < linux.NetlinkMessageHeaderSize {
+		msg, rest, ok := ParseMessage(buf)
+		if !ok {
 			// Linux ignores messages that are too short. See
 			// net/netlink/af_netlink.c:netlink_rcv_skb.
 			break
 		}
-
-		var hdr linux.NetlinkMessageHeader
-		binary.Unmarshal(buf[:linux.NetlinkMessageHeaderSize], usermem.ByteOrder, &hdr)
-
-		if hdr.Length < linux.NetlinkMessageHeaderSize || uint64(hdr.Length) > uint64(len(buf)) {
-			// Linux ignores malformed messages. See
-			// net/netlink/af_netlink.c:netlink_rcv_skb.
-			break
-		}
-
-		// Data from this message.
-		data := buf[linux.NetlinkMessageHeaderSize:hdr.Length]
-
-		// Advance to the next message.
-		next := alignUp(int(hdr.Length), linux.NLMSG_ALIGNTO)
-		if next >= len(buf)-1 {
-			next = len(buf) - 1
-		}
-		buf = buf[next:]
+		buf = rest
+		hdr := msg.Header()
 
 		// Ignore control messages.
 		if hdr.Type < linux.NLMSG_MIN_TYPE {
@@ -692,19 +683,10 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error
 		}
 
 		ms := NewMessageSet(s.portID, hdr.Seq)
-		var err *syserr.Error
-		// TODO(b/68877377): ACKs not supported yet.
-		if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
-			err = syserr.ErrNotSupported
-		} else {
-
-			err = s.protocol.ProcessMessage(ctx, hdr, data, ms)
-		}
-		if err != nil {
-			ms = NewMessageSet(s.portID, hdr.Seq)
-			if err := s.dumpErrorMesage(ctx, hdr, ms, err); err != nil {
-				return err
-			}
+		if err := s.protocol.ProcessMessage(ctx, msg, ms); err != nil {
+			dumpErrorMesage(hdr, ms, err)
+		} else if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
+			dumpAckMesage(hdr, ms)
 		}
 
 		if err := s.sendResponse(ctx, ms); err != nil {
diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go
index 1ee4296bc..029ba21b5 100644
--- a/pkg/sentry/socket/netlink/uevent/protocol.go
+++ b/pkg/sentry/socket/netlink/uevent/protocol.go
@@ -49,7 +49,7 @@ func (p *Protocol) CanSend() bool {
 }
 
 // ProcessMessage implements netlink.Protocol.ProcessMessage.
-func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// Silently ignore all messages.
 	return nil
 }
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index 31ea66eca..0692482e9 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -20,6 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
 	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -88,6 +90,59 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
 	return nicAddrs
 }
 
+// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
+func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+	var (
+		protocol tcpip.NetworkProtocolNumber
+		address  tcpip.Address
+	)
+	switch addr.Family {
+	case linux.AF_INET:
+		if len(addr.Addr) < header.IPv4AddressSize {
+			return syserror.EINVAL
+		}
+		if addr.PrefixLen > header.IPv4AddressSize*8 {
+			return syserror.EINVAL
+		}
+		protocol = ipv4.ProtocolNumber
+		address = tcpip.Address(addr.Addr[:header.IPv4AddressSize])
+
+	case linux.AF_INET6:
+		if len(addr.Addr) < header.IPv6AddressSize {
+			return syserror.EINVAL
+		}
+		if addr.PrefixLen > header.IPv6AddressSize*8 {
+			return syserror.EINVAL
+		}
+		protocol = ipv6.ProtocolNumber
+		address = tcpip.Address(addr.Addr[:header.IPv6AddressSize])
+
+	default:
+		return syserror.ENOTSUP
+	}
+
+	protocolAddress := tcpip.ProtocolAddress{
+		Protocol: protocol,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   address,
+			PrefixLen: int(addr.PrefixLen),
+		},
+	}
+
+	// Attach address to interface.
+	if err := s.Stack.AddProtocolAddressWithOptions(tcpip.NICID(idx), protocolAddress, stack.CanBePrimaryEndpoint); err != nil {
+		return syserr.TranslateNetstackError(err).ToError()
+	}
+
+	// Add route for local network.
+	s.Stack.AddRoute(tcpip.Route{
+		Destination: protocolAddress.AddressWithPrefix.Subnet(),
+		Gateway:     "", // No gateway for local network.
+		NIC:         tcpip.NICID(idx),
+	})
+	return nil
+}
+
 // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize.
 func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
 	var rs tcp.ReceiveBufferSizeOption
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 7057b110e..b793f1d74 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -795,6 +795,8 @@ func (s *Stack) Forwarding() bool {
 
 // SetRouteTable assigns the route table to be used by this stack. It
 // specifies which NIC to use for given destination address ranges.
+//
+// This method takes ownership of the table.
 func (s *Stack) SetRouteTable(table []tcpip.Route) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -809,6 +811,13 @@ func (s *Stack) GetRouteTable() []tcpip.Route {
 	return append([]tcpip.Route(nil), s.routeTable...)
 }
 
+// AddRoute appends a route to the route table.
+func (s *Stack) AddRoute(route tcpip.Route) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.routeTable = append(s.routeTable, route)
+}
+
 // NewEndpoint creates a new transport layer endpoint of the given protocol.
 func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	t, ok := s.transportProtocols[transport]
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 273b014d6..f2e3c7072 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2769,9 +2769,11 @@ cc_binary(
     deps = [
         ":socket_netlink_util",
         ":socket_test_util",
+        "//test/util:capability_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:optional",
         gtest,
         "//test/util:test_main",
         "//test/util:test_util",
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index 1e28e658d..e5aed1eec 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -14,6 +14,7 @@
 
 #include <arpa/inet.h>
 #include <ifaddrs.h>
+#include <linux/if.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 #include <sys/socket.h>
@@ -25,8 +26,10 @@
 
 #include "gtest/gtest.h"
 #include "absl/strings/str_format.h"
+#include "absl/types/optional.h"
 #include "test/syscalls/linux/socket_netlink_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
 #include "test/util/cleanup.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/test_util.h"
@@ -38,6 +41,8 @@ namespace testing {
 
 namespace {
 
+constexpr uint32_t kSeq = 12345;
+
 using ::testing::AnyOf;
 using ::testing::Eq;
 
@@ -113,58 +118,224 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
   // TODO(mpratt): Check ifinfomsg contents and following attrs.
 }
 
+PosixError DumpLinks(
+    const FileDescriptor& fd, uint32_t seq,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_seq = seq;
+  req.ifm.ifi_family = AF_UNSPEC;
+
+  return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false);
+}
+
 TEST(NetlinkRouteTest, GetLinkDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
+  // Loopback is common among all tests, check that it's found.
+  bool loopbackFound = false;
+  ASSERT_NO_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    CheckGetLinkResponse(hdr, kSeq, port);
+    if (hdr->nlmsg_type != RTM_NEWLINK) {
+      return;
+    }
+    ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    std::cout << "Found interface idx=" << msg->ifi_index
+              << ", type=" << std::hex << msg->ifi_type;
+    if (msg->ifi_type == ARPHRD_LOOPBACK) {
+      loopbackFound = true;
+      EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
+    }
+  }));
+  EXPECT_TRUE(loopbackFound);
+}
+
+struct Link {
+  int index;
+  std::string name;
+};
+
+PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  absl::optional<Link> link;
+  RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    if (hdr->nlmsg_type != RTM_NEWLINK ||
+        hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
+      return;
+    }
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    if (msg->ifi_type == ARPHRD_LOOPBACK) {
+      const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+      if (rta == nullptr) {
+        // Ignore links that do not have a name.
+        return;
+      }
+
+      link = Link();
+      link->index = msg->ifi_index;
+      link->name = std::string(reinterpret_cast<const char*>(RTA_DATA(rta)));
+    }
+  }));
+  return link;
+}
+
+// CheckLinkMsg checks a netlink message against an expected link.
+void CheckLinkMsg(const struct nlmsghdr* hdr, const Link& link) {
+  ASSERT_THAT(hdr->nlmsg_type, Eq(RTM_NEWLINK));
+  ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
+  const struct ifinfomsg* msg =
+      reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+  EXPECT_EQ(msg->ifi_index, link.index);
+
+  const struct rtattr* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+  EXPECT_NE(nullptr, rta) << "IFLA_IFNAME not found in message.";
+  if (rta != nullptr) {
+    std::string name(reinterpret_cast<const char*>(RTA_DATA(rta)));
+    EXPECT_EQ(name, link.name);
+  }
+}
+
+TEST(NetlinkRouteTest, GetLinkByIndex) {
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
   struct request {
     struct nlmsghdr hdr;
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
-  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
+  req.ifm.ifi_index = loopback_link->index;
 
-  // Loopback is common among all tests, check that it's found.
-  bool loopbackFound = false;
+  bool found = false;
   ASSERT_NO_ERRNO(NetlinkRequestResponse(
       fd, &req, sizeof(req),
       [&](const struct nlmsghdr* hdr) {
-        CheckGetLinkResponse(hdr, kSeq, port);
-        if (hdr->nlmsg_type != RTM_NEWLINK) {
-          return;
-        }
-        ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
-        const struct ifinfomsg* msg =
-            reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
-        std::cout << "Found interface idx=" << msg->ifi_index
-                  << ", type=" << std::hex << msg->ifi_type;
-        if (msg->ifi_type == ARPHRD_LOOPBACK) {
-          loopbackFound = true;
-          EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
-        }
+        CheckLinkMsg(hdr, *loopback_link);
+        found = true;
       },
       false));
-  EXPECT_TRUE(loopbackFound);
+  EXPECT_TRUE(found) << "Netlink response does not contain any links.";
 }
 
-TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
+TEST(NetlinkRouteTest, GetLinkByName) {
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
     struct ifinfomsg ifm;
+    struct rtattr rtattr;
+    char ifname[IFNAMSIZ];
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
   };
 
-  constexpr uint32_t kSeq = 12345;
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.rtattr.rta_type = IFLA_IFNAME;
+  req.rtattr.rta_len = RTA_LENGTH(loopback_link->name.size() + 1);
+  strncpy(req.ifname, loopback_link->name.c_str(), sizeof(req.ifname));
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  bool found = false;
+  ASSERT_NO_ERRNO(NetlinkRequestResponse(
+      fd, &req, sizeof(req),
+      [&](const struct nlmsghdr* hdr) {
+        CheckLinkMsg(hdr, *loopback_link);
+        found = true;
+      },
+      false));
+  EXPECT_TRUE(found) << "Netlink response does not contain any links.";
+}
+
+TEST(NetlinkRouteTest, GetLinkByIndexNotFound) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.ifm.ifi_index = 1234590;
+
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(ENODEV, ::testing::_));
+}
+
+TEST(NetlinkRouteTest, GetLinkByNameNotFound) {
+  const std::string name = "nodevice?!";
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+    struct rtattr rtattr;
+    char ifname[IFNAMSIZ];
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.rtattr.rta_type = IFLA_IFNAME;
+  req.rtattr.rta_len = RTA_LENGTH(name.size() + 1);
+  strncpy(req.ifname, name.c_str(), sizeof(req.ifname));
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(ENODEV, ::testing::_));
+}
+
+TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -175,18 +346,8 @@ TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
 
-  ASSERT_NO_ERRNO(NetlinkRequestResponse(
-      fd, &req, sizeof(req),
-      [&](const struct nlmsghdr* hdr) {
-        EXPECT_THAT(hdr->nlmsg_type, Eq(NLMSG_ERROR));
-        EXPECT_EQ(hdr->nlmsg_seq, kSeq);
-        EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct nlmsgerr));
-
-        const struct nlmsgerr* msg =
-            reinterpret_cast<const struct nlmsgerr*>(NLMSG_DATA(hdr));
-        EXPECT_EQ(msg->error, -EOPNOTSUPP);
-      },
-      true));
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(EOPNOTSUPP, ::testing::_));
 }
 
 TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
@@ -198,8 +359,6 @@ TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
@@ -238,8 +397,6 @@ TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
@@ -282,8 +439,6 @@ TEST(NetlinkRouteTest, ControlMessageIgnored) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
 
   // This control message is ignored. We still receive a response for the
@@ -317,8 +472,6 @@ TEST(NetlinkRouteTest, GetAddrDump) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -367,6 +520,57 @@ TEST(NetlinkRouteTest, LookupAll) {
   ASSERT_GT(count, 0);
 }
 
+TEST(NetlinkRouteTest, AddAddr) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifaddrmsg ifa;
+    struct rtattr rtattr;
+    struct in_addr addr;
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_NEWADDR;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifa.ifa_family = AF_INET;
+  req.ifa.ifa_prefixlen = 24;
+  req.ifa.ifa_flags = 0;
+  req.ifa.ifa_scope = 0;
+  req.ifa.ifa_index = loopback_link->index;
+  req.rtattr.rta_type = IFA_LOCAL;
+  req.rtattr.rta_len = RTA_LENGTH(sizeof(req.addr));
+  inet_pton(AF_INET, "10.0.0.1", &req.addr);
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifa)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  // Create should succeed, as no such address in kernel.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
+  EXPECT_NO_ERRNO(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+
+  // Replace an existing address should succeed.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_REPLACE | NLM_F_ACK;
+  req.hdr.nlmsg_seq++;
+  EXPECT_NO_ERRNO(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+
+  // Create exclusive should fail, as we created the address above.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
+  req.hdr.nlmsg_seq++;
+  EXPECT_THAT(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len),
+      PosixErrorIs(EEXIST, ::testing::_));
+}
+
 // GetRouteDump tests a RTM_GETROUTE + NLM_F_DUMP request.
 TEST(NetlinkRouteTest, GetRouteDump) {
   FileDescriptor fd =
@@ -378,8 +582,6 @@ TEST(NetlinkRouteTest, GetRouteDump) {
     struct rtmsg rtm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETROUTE;
@@ -538,8 +740,6 @@ TEST(NetlinkRouteTest, RecvmsgTrunc) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -615,8 +815,6 @@ TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -695,8 +893,6 @@ TEST(NetlinkRouteTest, NoPasscredNoCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -743,8 +939,6 @@ TEST(NetlinkRouteTest, PasscredCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index cd2212a1a..952eecfe8 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -16,6 +16,7 @@
 
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
+#include <linux/rtnetlink.h>
 #include <sys/socket.h>
 
 #include <vector>
@@ -71,9 +72,10 @@ PosixError NetlinkRequestResponse(
   iov.iov_base = buf.data();
   iov.iov_len = buf.size();
 
-  // Response is a series of NLM_F_MULTI messages, ending with a NLMSG_DONE
-  // message.
+  // If NLM_F_MULTI is set, response is a series of messages that ends with a
+  // NLMSG_DONE message.
   int type = -1;
+  int flags = 0;
   do {
     int len;
     RETURN_ERROR_IF_SYSCALL_FAIL(len = RetryEINTR(recvmsg)(fd.get(), &msg, 0));
@@ -89,6 +91,7 @@ PosixError NetlinkRequestResponse(
     for (struct nlmsghdr* hdr = reinterpret_cast<struct nlmsghdr*>(buf.data());
          NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) {
       fn(hdr);
+      flags = hdr->nlmsg_flags;
       type = hdr->nlmsg_type;
       // Done should include an integer payload for dump_done_errno.
       // See net/netlink/af_netlink.c:netlink_dump
@@ -98,11 +101,11 @@ PosixError NetlinkRequestResponse(
         EXPECT_GE(hdr->nlmsg_len, NLMSG_LENGTH(sizeof(int)));
       }
     }
-  } while (type != NLMSG_DONE && type != NLMSG_ERROR);
+  } while ((flags & NLM_F_MULTI) && type != NLMSG_DONE && type != NLMSG_ERROR);
 
   if (expect_nlmsgerr) {
     EXPECT_EQ(type, NLMSG_ERROR);
-  } else {
+  } else if (flags & NLM_F_MULTI) {
     EXPECT_EQ(type, NLMSG_DONE);
   }
   return NoError();
@@ -146,5 +149,39 @@ PosixError NetlinkRequestResponseSingle(
   return NoError();
 }
 
+PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq,
+                                    void* request, size_t len) {
+  // Dummy negative number for no error message received.
+  // We won't get a negative error number so there will be no confusion.
+  int err = -42;
+  RETURN_IF_ERRNO(NetlinkRequestResponse(
+      fd, request, len,
+      [&](const struct nlmsghdr* hdr) {
+        EXPECT_EQ(NLMSG_ERROR, hdr->nlmsg_type);
+        EXPECT_EQ(hdr->nlmsg_seq, seq);
+        EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct nlmsgerr));
+
+        const struct nlmsgerr* msg =
+            reinterpret_cast<const struct nlmsgerr*>(NLMSG_DATA(hdr));
+        err = -msg->error;
+      },
+      true));
+  return PosixError(err);
+}
+
+const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr,
+                                const struct ifinfomsg* msg, int16_t attr) {
+  const int ifi_space = NLMSG_SPACE(sizeof(*msg));
+  int attrlen = hdr->nlmsg_len - ifi_space;
+  const struct rtattr* rta = reinterpret_cast<const struct rtattr*>(
+      reinterpret_cast<const uint8_t*>(hdr) + NLMSG_ALIGN(ifi_space));
+  for (; RTA_OK(rta, attrlen); rta = RTA_NEXT(rta, attrlen)) {
+    if (rta->rta_type == attr) {
+      return rta;
+    }
+  }
+  return nullptr;
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index 3678c0599..e13ead406 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -19,6 +19,7 @@
 // socket.h has to be included before if_arp.h.
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
+#include <linux/rtnetlink.h>
 
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
@@ -47,6 +48,14 @@ PosixError NetlinkRequestResponseSingle(
     const FileDescriptor& fd, void* request, size_t len,
     const std::function<void(const struct nlmsghdr* hdr)>& fn);
 
+// Send the passed request then expect and return an ack or error.
+PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq,
+                                    void* request, size_t len);
+
+// Find rtnetlink attribute in message.
+const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr,
+                                const struct ifinfomsg* msg, int16_t attr);
+
 }  // namespace testing
 }  // namespace gvisor
 
-- 
cgit v1.2.3


From eea0eeee933ba8406ae688fce4348271f9513514 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Wed, 5 Feb 2020 11:25:10 -0800
Subject: Disable get/set xattrs until list/remove exist too.

PiperOrigin-RevId: 293411655
---
 pkg/sentry/syscalls/linux/linux64_amd64.go |  27 ++++---
 pkg/sentry/syscalls/linux/linux64_arm64.go |  37 +++++----
 test/syscalls/linux/xattr.cc               | 124 +++++++++++++++++++++++++++++
 3 files changed, 159 insertions(+), 29 deletions(-)

(limited to 'test/syscalls')

diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 7435b50bf..588f8b087 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -228,18 +228,21 @@ var AMD64 = &kernel.SyscallTable{
 		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
-		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
-		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
-		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
-		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
-		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		// TODO(b/148303075): Enable set/getxattr (in their various
+		// forms) once we also have list and removexattr. The JVM
+		// assumes that if get/set exist, then list and remove do too.
+		188: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		189: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		190: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
 		200: syscalls.Supported("tkill", Tkill),
 		201: syscalls.Supported("time", Time),
 		202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 03a39fe65..06e5ee401 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -36,23 +36,26 @@ var ARM64 = &kernel.SyscallTable{
 	},
 	AuditNumber: linux.AUDIT_ARCH_AARCH64,
 	Table: map[uintptr]kernel.Syscall{
-		0:   syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		1:   syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
-		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
-		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
-		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
-		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		12:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		14:  syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		15:  syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		16:  syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		// TODO(b/148303075): Enable set/getxattr (in their various
+		// forms) once we also have list and removexattr. The JVM
+		// assumes that if get/set exist, then list and remove do too.
+		5:   syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		6:   syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		7:   syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		8:   syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		13:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		14:  syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		15:  syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		16:  syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
 		17:  syscalls.Supported("getcwd", Getcwd),
 		18:  syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
 		19:  syscalls.Supported("eventfd2", Eventfd2),
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index ab21d68c6..85eb31847 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -39,6 +39,10 @@ namespace {
 class XattrTest : public FileTest {};
 
 TEST_F(XattrTest, XattrNullName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
@@ -48,6 +52,10 @@ TEST_F(XattrTest, XattrNullName) {
 }
 
 TEST_F(XattrTest, XattrEmptyName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
@@ -56,6 +64,10 @@ TEST_F(XattrTest, XattrEmptyName) {
 }
 
 TEST_F(XattrTest, XattrLargeName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   std::string name = "user.";
   name += std::string(XATTR_NAME_MAX - name.length(), 'a');
@@ -77,6 +89,10 @@ TEST_F(XattrTest, XattrLargeName) {
 }
 
 TEST_F(XattrTest, XattrInvalidPrefix) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   std::string name(XATTR_NAME_MAX, 'a');
   EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
@@ -88,6 +104,10 @@ TEST_F(XattrTest, XattrInvalidPrefix) {
 // Do not allow save/restore cycles after making the test file read-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -113,6 +133,10 @@ TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
 // Do not allow save/restore cycles after making the test file write-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -143,6 +167,10 @@ TEST_F(XattrTest, XattrTrustedWithNonadmin) {
 }
 
 TEST_F(XattrTest, XattrOnDirectory) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
@@ -152,6 +180,10 @@ TEST_F(XattrTest, XattrOnDirectory) {
 }
 
 TEST_F(XattrTest, XattrOnSymlink) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
@@ -163,6 +195,10 @@ TEST_F(XattrTest, XattrOnSymlink) {
 }
 
 TEST_F(XattrTest, XattrOnInvalidFileTypes) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char name[] = "user.test";
 
   char char_device[] = "/dev/zero";
@@ -181,6 +217,10 @@ TEST_F(XattrTest, XattrOnInvalidFileTypes) {
 }
 
 TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -196,6 +236,10 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, SetxattrZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -208,6 +252,10 @@ TEST_F(XattrTest, SetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrSizeTooLarge) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
 
@@ -223,6 +271,10 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
@@ -232,6 +284,10 @@ TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -240,6 +296,10 @@ TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val(XATTR_SIZE_MAX + 1);
@@ -256,6 +316,10 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -271,6 +335,10 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithLarger) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -285,6 +353,10 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) {
 }
 
 TEST_F(XattrTest, SetxattrCreateFlag) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
@@ -296,6 +368,10 @@ TEST_F(XattrTest, SetxattrCreateFlag) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceFlag) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
@@ -308,6 +384,10 @@ TEST_F(XattrTest, SetxattrReplaceFlag) {
 }
 
 TEST_F(XattrTest, SetxattrInvalidFlags) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   int invalid_flags = 0xff;
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
@@ -315,6 +395,10 @@ TEST_F(XattrTest, SetxattrInvalidFlags) {
 }
 
 TEST_F(XattrTest, Getxattr) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -327,6 +411,10 @@ TEST_F(XattrTest, Getxattr) {
 }
 
 TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -339,6 +427,10 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -354,6 +446,10 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -367,6 +463,10 @@ TEST_F(XattrTest, GetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrSizeTooLarge) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -383,6 +483,10 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, GetxattrNullValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -394,6 +498,10 @@ TEST_F(XattrTest, GetxattrNullValue) {
 }
 
 TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -410,12 +518,20 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrNonexistentName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, LGetSetxattrOnSymlink) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
@@ -427,6 +543,10 @@ TEST_F(XattrTest, LGetSetxattrOnSymlink) {
 }
 
 TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -441,6 +561,10 @@ TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
 }
 
 TEST_F(XattrTest, FGetSetxattr) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added pack to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0));
   const char name[] = "user.test";
-- 
cgit v1.2.3


From f3d95607036b8a502c65aa7b3e8145227274dbbc Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 5 Feb 2020 17:56:00 -0800
Subject: recv() on a closed TCP socket returns ENOTCONN

From RFC 793 s3.9 p58 Event Processing:

If RECEIVE Call arrives in CLOSED state and the user has access to such a
connection, the return should be "error: connection does not exist"

Fixes #1598

PiperOrigin-RevId: 293494287
---
 pkg/sentry/socket/netstack/netstack.go | 7 ++++++-
 pkg/tcpip/tcpip.go                     | 4 ++++
 pkg/tcpip/transport/tcp/endpoint.go    | 4 ++--
 pkg/tcpip/transport/tcp/tcp_test.go    | 9 ++++-----
 test/syscalls/linux/tcp_socket.cc      | 9 +++++++++
 5 files changed, 25 insertions(+), 8 deletions(-)

(limited to 'test/syscalls')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 049d04bf2..ed2fbcceb 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -2229,11 +2229,16 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq
 	var copied int
 
 	// Copy as many views as possible into the user-provided buffer.
-	for dst.NumBytes() != 0 {
+	for {
+		// Always do at least one fetchReadView, even if the number of bytes to
+		// read is 0.
 		err = s.fetchReadView()
 		if err != nil {
 			break
 		}
+		if dst.NumBytes() == 0 {
+			break
+		}
 
 		var n int
 		var e error
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 0fa141d58..d29d9a704 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -1124,6 +1124,10 @@ type ReadErrors struct {
 	// InvalidEndpointState is the number of times we found the endpoint state
 	// to be unexpected.
 	InvalidEndpointState StatCounter
+
+	// NotConnected is the number of times we tried to read but found that the
+	// endpoint was not connected.
+	NotConnected StatCounter
 }
 
 // WriteErrors collects packet write errors from an endpoint write call.
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index b5a8e15ee..e4a6b1b8b 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1003,8 +1003,8 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 		if s == StateError {
 			return buffer.View{}, tcpip.ControlMessages{}, he
 		}
-		e.stats.ReadErrors.InvalidEndpointState.Increment()
-		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
+		e.stats.ReadErrors.NotConnected.Increment()
+		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrNotConnected
 	}
 
 	v, err := e.readLocked()
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 2c1505067..cc118c993 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -5405,12 +5405,11 @@ func TestEndpointBindListenAcceptState(t *testing.T) {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
 	}
 
-	// Expect InvalidEndpointState errors on a read at this point.
-	if _, _, err := ep.Read(nil); err != tcpip.ErrInvalidEndpointState {
-		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrInvalidEndpointState)
+	if _, _, err := ep.Read(nil); err != tcpip.ErrNotConnected {
+		t.Errorf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrNotConnected)
 	}
-	if got := ep.Stats().(*tcp.Stats).ReadErrors.InvalidEndpointState.Value(); got != 1 {
-		t.Fatalf("got EP stats Stats.ReadErrors.InvalidEndpointState got %v want %v", got, 1)
+	if got := ep.Stats().(*tcp.Stats).ReadErrors.NotConnected.Value(); got != 1 {
+		t.Errorf("got EP stats Stats.ReadErrors.NotConnected got %v want %v", got, 1)
 	}
 
 	if err := ep.Listen(10); err != nil {
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 525ccbd88..8a8b68e75 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1339,6 +1339,15 @@ TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptGreaterThanZero) {
   EXPECT_EQ(get, kTCPDeferAccept);
 }
 
+TEST_P(SimpleTcpSocketTest, RecvOnClosedSocket) {
+  auto s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  char buf[1];
+  EXPECT_THAT(recv(s.get(), buf, 0, 0), SyscallFailsWithErrno(ENOTCONN));
+  EXPECT_THAT(recv(s.get(), buf, sizeof(buf), 0),
+              SyscallFailsWithErrno(ENOTCONN));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From 1b6a12a768216a99a5e0428c42ea4faf79cf3b50 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 5 Feb 2020 22:45:44 -0800
Subject: Add notes to relevant tests.

These were out-of-band notes that can help provide additional context
and simplify automated imports.

PiperOrigin-RevId: 293525915
---
 pkg/metric/metric.go                          |  1 -
 pkg/sentry/arch/arch_x86.go                   |  4 ++
 pkg/sentry/arch/signal_amd64.go               |  2 +-
 pkg/sentry/fs/file_overlay_test.go            |  1 +
 pkg/sentry/fs/proc/README.md                  |  4 ++
 pkg/sentry/kernel/BUILD                       |  1 +
 pkg/sentry/kernel/kernel.go                   |  3 ++
 pkg/sentry/kernel/kernel_opts.go              | 20 +++++++
 pkg/sentry/socket/hostinet/BUILD              |  1 +
 pkg/sentry/socket/hostinet/socket.go          |  5 +-
 pkg/sentry/socket/hostinet/sockopt_impl.go    | 27 ++++++++++
 pkg/tcpip/transport/tcp/endpoint.go           |  3 ++
 runsc/boot/filter/BUILD                       |  1 +
 runsc/boot/filter/config.go                   | 13 -----
 runsc/boot/filter/config_profile.go           | 34 ++++++++++++
 runsc/container/console_test.go               |  5 +-
 runsc/dockerutil/dockerutil.go                | 11 ++--
 runsc/testutil/BUILD                          |  5 +-
 runsc/testutil/testutil.go                    | 54 -------------------
 runsc/testutil/testutil_runfiles.go           | 75 +++++++++++++++++++++++++++
 test/image/image_test.go                      |  8 +--
 test/syscalls/build_defs.bzl                  | 35 +++++++++++--
 test/syscalls/linux/chroot.cc                 |  2 +-
 test/syscalls/linux/concurrency.cc            |  3 +-
 test/syscalls/linux/exec_proc_exe_workload.cc |  6 +++
 test/syscalls/linux/fork.cc                   |  5 +-
 test/syscalls/linux/mmap.cc                   |  8 +--
 test/syscalls/linux/open_create.cc            |  1 +
 test/syscalls/linux/preadv.cc                 |  1 +
 test/syscalls/linux/proc.cc                   | 46 +++++++++++++---
 test/syscalls/linux/readv.cc                  |  4 +-
 test/syscalls/linux/rseq.cc                   |  2 +-
 test/syscalls/linux/select.cc                 |  2 +-
 test/syscalls/linux/shm.cc                    |  2 +-
 test/syscalls/linux/sigprocmask.cc            |  2 +-
 test/syscalls/linux/socket_unix_non_stream.cc |  4 +-
 test/syscalls/linux/symlink.cc                |  2 +-
 test/syscalls/linux/tcp_socket.cc             |  3 +-
 test/syscalls/linux/time.cc                   |  1 +
 test/syscalls/linux/tkill.cc                  |  2 +-
 test/util/temp_path.cc                        |  1 +
 tools/build/tags.bzl                          |  4 ++
 tools/defs.bzl                                | 17 +++++-
 43 files changed, 318 insertions(+), 113 deletions(-)
 create mode 100644 pkg/sentry/kernel/kernel_opts.go
 create mode 100644 pkg/sentry/socket/hostinet/sockopt_impl.go
 create mode 100644 runsc/boot/filter/config_profile.go
 create mode 100644 runsc/testutil/testutil_runfiles.go

(limited to 'test/syscalls')

diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go
index 93d4f2b8c..006fcd9ab 100644
--- a/pkg/metric/metric.go
+++ b/pkg/metric/metric.go
@@ -46,7 +46,6 @@ var (
 //
 // TODO(b/67298402): Support non-cumulative metrics.
 // TODO(b/67298427): Support metric fields.
-//
 type Uint64Metric struct {
 	// value is the actual value of the metric. It must be accessed
 	// atomically.
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index a18093155..3db8bd34b 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -114,6 +114,10 @@ func newX86FPStateSlice() []byte {
 	size, align := cpuid.HostFeatureSet().ExtendedStateSize()
 	capacity := size
 	// Always use at least 4096 bytes.
+	//
+	// For the KVM platform, this state is a fixed 4096 bytes, so make sure
+	// that the underlying array is at _least_ that size otherwise we will
+	// corrupt random memory. This is not a pleasant thing to debug.
 	if capacity < 4096 {
 		capacity = 4096
 	}
diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go
index 81b92bb43..6fb756f0e 100644
--- a/pkg/sentry/arch/signal_amd64.go
+++ b/pkg/sentry/arch/signal_amd64.go
@@ -55,7 +55,7 @@ type SignalContext64 struct {
 	Trapno  uint64
 	Oldmask linux.SignalSet
 	Cr2     uint64
-	// Pointer to a struct _fpstate.
+	// Pointer to a struct _fpstate. See b/33003106#comment8.
 	Fpstate  uint64
 	Reserved [8]uint64
 }
diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go
index 02538bb4f..a76d87e3a 100644
--- a/pkg/sentry/fs/file_overlay_test.go
+++ b/pkg/sentry/fs/file_overlay_test.go
@@ -177,6 +177,7 @@ func TestReaddirRevalidation(t *testing.T) {
 
 // TestReaddirOverlayFrozen tests that calling Readdir on an overlay file with
 // a frozen dirent tree does not make Readdir calls to the underlying files.
+// This is a regression test for b/114808269.
 func TestReaddirOverlayFrozen(t *testing.T) {
 	ctx := contexttest.Context(t)
 
diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md
index 5d4ec6c7b..6667a0916 100644
--- a/pkg/sentry/fs/proc/README.md
+++ b/pkg/sentry/fs/proc/README.md
@@ -11,6 +11,8 @@ inconsistency, please file a bug.
 
 The following files are implemented:
 
+<!-- mdformat off(don't wrap the table) -->
+
 | File /proc/                 | Content                                               |
 | :------------------------   | :---------------------------------------------------- |
 | [cpuinfo](#cpuinfo)         | Info about the CPU                                    |
@@ -22,6 +24,8 @@ The following files are implemented:
 | [uptime](#uptime)           | Wall clock since boot, combined idle time of all cpus |
 | [version](#version)         | Kernel version                                        |
 
+<!-- mdformat on -->
+
 ### cpuinfo
 
 ```bash
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index a27628c0a..2231d6973 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -91,6 +91,7 @@ go_library(
         "fs_context.go",
         "ipc_namespace.go",
         "kernel.go",
+        "kernel_opts.go",
         "kernel_state.go",
         "pending_signals.go",
         "pending_signals_list.go",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index dcd6e91c4..3ee760ba2 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -235,6 +235,9 @@ type Kernel struct {
 	// events. This is initialized lazily on the first unimplemented
 	// syscall.
 	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`
+
+	// SpecialOpts contains special kernel options.
+	SpecialOpts
 }
 
 // InitKernelArgs holds arguments to Init.
diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go
new file mode 100644
index 000000000..2e66ec587
--- /dev/null
+++ b/pkg/sentry/kernel/kernel_opts.go
@@ -0,0 +1,20 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// SpecialOpts contains non-standard options for the kernel.
+//
+// +stateify savable
+type SpecialOpts struct{}
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index 5a07d5d0e..023bad156 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -10,6 +10,7 @@ go_library(
         "save_restore.go",
         "socket.go",
         "socket_unsafe.go",
+        "sockopt_impl.go",
         "stack.go",
     ],
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 34f63986f..de76388ac 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -285,7 +285,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 	}
 
 	// Whitelist options and constrain option length.
-	var optlen int
+	optlen := getSockOptLen(t, level, name)
 	switch level {
 	case linux.SOL_IP:
 		switch name {
@@ -330,7 +330,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 // SetSockOpt implements socket.Socket.SetSockOpt.
 func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
 	// Whitelist options and constrain option length.
-	var optlen int
+	optlen := setSockOptLen(t, level, name)
 	switch level {
 	case linux.SOL_IP:
 		switch name {
@@ -353,6 +353,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 			optlen = sizeofInt32
 		}
 	}
+
 	if optlen == 0 {
 		// Pretend to accept socket options we don't understand. This seems
 		// dangerous, but it's what netstack does...
diff --git a/pkg/sentry/socket/hostinet/sockopt_impl.go b/pkg/sentry/socket/hostinet/sockopt_impl.go
new file mode 100644
index 000000000..8a783712e
--- /dev/null
+++ b/pkg/sentry/socket/hostinet/sockopt_impl.go
@@ -0,0 +1,27 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostinet
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+func getSockOptLen(t *kernel.Task, level, name int) int {
+	return 0 // No custom options.
+}
+
+func setSockOptLen(t *kernel.Task, level, name int) int {
+	return 0 // No custom options.
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index e4a6b1b8b..f2be0e651 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2166,6 +2166,9 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	e.isRegistered = true
 	e.setEndpointState(StateListen)
 
+	// The channel may be non-nil when we're restoring the endpoint, and it
+	// may be pre-populated with some previously accepted (but not Accepted)
+	// endpoints.
 	if e.acceptedChan == nil {
 		e.acceptedChan = make(chan *endpoint, backlog)
 	}
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
index ce30f6c53..ed18f0047 100644
--- a/runsc/boot/filter/BUILD
+++ b/runsc/boot/filter/BUILD
@@ -8,6 +8,7 @@ go_library(
         "config.go",
         "config_amd64.go",
         "config_arm64.go",
+        "config_profile.go",
         "extra_filters.go",
         "extra_filters_msan.go",
         "extra_filters_race.go",
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index f8d351c7b..c69f4c602 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -536,16 +536,3 @@ func controlServerFilters(fd int) seccomp.SyscallRules {
 		},
 	}
 }
-
-// profileFilters returns extra syscalls made by runtime/pprof package.
-func profileFilters() seccomp.SyscallRules {
-	return seccomp.SyscallRules{
-		syscall.SYS_OPENAT: []seccomp.Rule{
-			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
-			},
-		},
-	}
-}
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
new file mode 100644
index 000000000..194952a7b
--- /dev/null
+++ b/runsc/boot/filter/config_profile.go
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// profileFilters returns extra syscalls made by runtime/pprof package.
+func profileFilters() seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_OPENAT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+			},
+		},
+	}
+}
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 060b63bf3..c2518d52b 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -196,7 +196,10 @@ func TestJobControlSignalExec(t *testing.T) {
 	defer ptyMaster.Close()
 	defer ptySlave.Close()
 
-	// Exec bash and attach a terminal.
+	// Exec bash and attach a terminal. Note that occasionally /bin/sh
+	// may be a different shell or have a different configuration (such
+	// as disabling interactive mode and job control). Since we want to
+	// explicitly test interactive mode, use /bin/bash. See b/116981926.
 	execArgs := &control.ExecArgs{
 		Filename: "/bin/bash",
 		// Don't let bash execute from profile or rc files, otherwise
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
index 9b6346ca2..1ff5e8cc3 100644
--- a/runsc/dockerutil/dockerutil.go
+++ b/runsc/dockerutil/dockerutil.go
@@ -143,8 +143,11 @@ func PrepareFiles(names ...string) (string, error) {
 		return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err)
 	}
 	for _, name := range names {
-		src := getLocalPath(name)
-		dst := path.Join(dir, name)
+		src, err := testutil.FindFile(name)
+		if err != nil {
+			return "", fmt.Errorf("testutil.Preparefiles(%q) failed: %v", name, err)
+		}
+		dst := path.Join(dir, path.Base(name))
 		if err := testutil.Copy(src, dst); err != nil {
 			return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err)
 		}
@@ -152,10 +155,6 @@ func PrepareFiles(names ...string) (string, error) {
 	return dir, nil
 }
 
-func getLocalPath(file string) string {
-	return path.Join(".", file)
-}
-
 // do executes docker command.
 func do(args ...string) (string, error) {
 	log.Printf("Running: docker %s\n", args)
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
index f845120b0..945405303 100644
--- a/runsc/testutil/BUILD
+++ b/runsc/testutil/BUILD
@@ -5,7 +5,10 @@ package(licenses = ["notice"])
 go_library(
     name = "testutil",
     testonly = 1,
-    srcs = ["testutil.go"],
+    srcs = [
+        "testutil.go",
+        "testutil_runfiles.go",
+    ],
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index edf2e809a..80c2c9680 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -79,60 +79,6 @@ func ConfigureExePath() error {
 	return nil
 }
 
-// FindFile searchs for a file inside the test run environment. It returns the
-// full path to the file. It fails if none or more than one file is found.
-func FindFile(path string) (string, error) {
-	wd, err := os.Getwd()
-	if err != nil {
-		return "", err
-	}
-
-	// The test root is demarcated by a path element called "__main__". Search for
-	// it backwards from the working directory.
-	root := wd
-	for {
-		dir, name := filepath.Split(root)
-		if name == "__main__" {
-			break
-		}
-		if len(dir) == 0 {
-			return "", fmt.Errorf("directory __main__ not found in %q", wd)
-		}
-		// Remove ending slash to loop around.
-		root = dir[:len(dir)-1]
-	}
-
-	// Annoyingly, bazel adds the build type to the directory path for go
-	// binaries, but not for c++ binaries. We use two different patterns to
-	// to find our file.
-	patterns := []string{
-		// Try the obvious path first.
-		filepath.Join(root, path),
-		// If it was a go binary, use a wildcard to match the build
-		// type. The pattern is: /test-path/__main__/directories/*/file.
-		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
-	}
-
-	for _, p := range patterns {
-		matches, err := filepath.Glob(p)
-		if err != nil {
-			// "The only possible returned error is ErrBadPattern,
-			// when pattern is malformed." -godoc
-			return "", fmt.Errorf("error globbing %q: %v", p, err)
-		}
-		switch len(matches) {
-		case 0:
-			// Try the next pattern.
-		case 1:
-			// We found it.
-			return matches[0], nil
-		default:
-			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
-		}
-	}
-	return "", fmt.Errorf("file %q not found", path)
-}
-
 // TestConfig returns the default configuration to use in tests. Note that
 // 'RootDir' must be set by caller if required.
 func TestConfig() *boot.Config {
diff --git a/runsc/testutil/testutil_runfiles.go b/runsc/testutil/testutil_runfiles.go
new file mode 100644
index 000000000..ece9ea9a1
--- /dev/null
+++ b/runsc/testutil/testutil_runfiles.go
@@ -0,0 +1,75 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testutil
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// FindFile searchs for a file inside the test run environment. It returns the
+// full path to the file. It fails if none or more than one file is found.
+func FindFile(path string) (string, error) {
+	wd, err := os.Getwd()
+	if err != nil {
+		return "", err
+	}
+
+	// The test root is demarcated by a path element called "__main__". Search for
+	// it backwards from the working directory.
+	root := wd
+	for {
+		dir, name := filepath.Split(root)
+		if name == "__main__" {
+			break
+		}
+		if len(dir) == 0 {
+			return "", fmt.Errorf("directory __main__ not found in %q", wd)
+		}
+		// Remove ending slash to loop around.
+		root = dir[:len(dir)-1]
+	}
+
+	// Annoyingly, bazel adds the build type to the directory path for go
+	// binaries, but not for c++ binaries. We use two different patterns to
+	// to find our file.
+	patterns := []string{
+		// Try the obvious path first.
+		filepath.Join(root, path),
+		// If it was a go binary, use a wildcard to match the build
+		// type. The pattern is: /test-path/__main__/directories/*/file.
+		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
+	}
+
+	for _, p := range patterns {
+		matches, err := filepath.Glob(p)
+		if err != nil {
+			// "The only possible returned error is ErrBadPattern,
+			// when pattern is malformed." -godoc
+			return "", fmt.Errorf("error globbing %q: %v", p, err)
+		}
+		switch len(matches) {
+		case 0:
+			// Try the next pattern.
+		case 1:
+			// We found it.
+			return matches[0], nil
+		default:
+			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
+		}
+	}
+	return "", fmt.Errorf("file %q not found", path)
+}
diff --git a/test/image/image_test.go b/test/image/image_test.go
index d0dcb1861..0a1e19d6f 100644
--- a/test/image/image_test.go
+++ b/test/image/image_test.go
@@ -107,7 +107,7 @@ func TestHttpd(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("http-test")
 
-	dir, err := dockerutil.PrepareFiles("latin10k.txt")
+	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -139,7 +139,7 @@ func TestNginx(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("net-test")
 
-	dir, err := dockerutil.PrepareFiles("latin10k.txt")
+	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -183,7 +183,7 @@ func TestMysql(t *testing.T) {
 	}
 
 	client := dockerutil.MakeDocker("mysql-client-test")
-	dir, err := dockerutil.PrepareFiles("mysql.sql")
+	dir, err := dockerutil.PrepareFiles("test/image/mysql.sql")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -283,7 +283,7 @@ func TestRuby(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("ruby-test")
 
-	dir, err := dockerutil.PrepareFiles("ruby.rb", "ruby.sh")
+	dir, err := dockerutil.PrepareFiles("test/image/ruby.rb", "test/image/ruby.sh")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index 1df761dd0..cbab85ef7 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -2,8 +2,6 @@
 
 load("//tools:defs.bzl", "loopback")
 
-# syscall_test is a macro that will create targets to run the given test target
-# on the host (native) and runsc.
 def syscall_test(
         test,
         shard_count = 5,
@@ -13,6 +11,19 @@ def syscall_test(
         add_uds_tree = False,
         add_hostinet = False,
         tags = None):
+    """syscall_test is a macro that will create targets for all platforms.
+
+    Args:
+      test: the test target.
+      shard_count: shards for defined tests.
+      size: the defined test size.
+      use_tmpfs: use tmpfs in the defined tests.
+      add_overlay: add an overlay test.
+      add_uds_tree: add a UDS test.
+      add_hostinet: add a hostinet test.
+      tags: starting test tags.
+    """
+
     _syscall_test(
         test = test,
         shard_count = shard_count,
@@ -111,6 +122,19 @@ def _syscall_test(
     # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared.
     tags += [full_platform, "file_" + file_access]
 
+    # Hash this target into one of 15 buckets. This can be used to
+    # randomly split targets between different workflows.
+    hash15 = hash(native.package_name() + name) % 15
+    tags.append("hash15:" + str(hash15))
+
+    # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until
+    # we figure out how to request ipv4 sockets on Guitar machines.
+    if network == "host":
+        tags.append("noguitar")
+
+    # Disable off-host networking.
+    tags.append("requires-net:loopback")
+
     # Add tag to prevent the tests from running in a Bazel sandbox.
     # TODO(b/120560048): Make the tests run without this tag.
     tags.append("no-sandbox")
@@ -118,8 +142,11 @@ def _syscall_test(
     # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is
     # more stable.
     if platform == "kvm":
-        tags += ["manual"]
-        tags += ["requires-kvm"]
+        tags.append("manual")
+        tags.append("requires-kvm")
+
+        # TODO(b/112165693): Remove when tests pass reliably.
+        tags.append("notap")
 
     args = [
         # Arguments are passed directly to syscall_test_runner binary.
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index 0a2d44a2c..85ec013d5 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -167,7 +167,7 @@ TEST(ChrootTest, DotDotFromOpenFD) {
 }
 
 // Test that link resolution in a chroot can escape the root by following an
-// open proc fd.
+// open proc fd. Regression test for b/32316719.
 TEST(ChrootTest, ProcFdLinkResolutionInChroot) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
 
diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc
index f41f99900..7cd6a75bd 100644
--- a/test/syscalls/linux/concurrency.cc
+++ b/test/syscalls/linux/concurrency.cc
@@ -46,7 +46,8 @@ TEST(ConcurrencyTest, SingleProcessMultithreaded) {
 }
 
 // Test that multiple threads in this process continue to execute in parallel,
-// even if an unrelated second process is spawned.
+// even if an unrelated second process is spawned. Regression test for
+// b/32119508.
 TEST(ConcurrencyTest, MultiProcessMultithreaded) {
   // In PID 1, start TIDs 1 and 2, and put both to sleep.
   //
diff --git a/test/syscalls/linux/exec_proc_exe_workload.cc b/test/syscalls/linux/exec_proc_exe_workload.cc
index b790fe5be..2989379b7 100644
--- a/test/syscalls/linux/exec_proc_exe_workload.cc
+++ b/test/syscalls/linux/exec_proc_exe_workload.cc
@@ -21,6 +21,12 @@
 #include "test/util/posix_error.h"
 
 int main(int argc, char** argv, char** envp) {
+  // This is annoying. Because remote build systems may put these binaries
+  // in a content-addressable-store, you may wind up with /proc/self/exe
+  // pointing to some random path (but with a sensible argv[0]).
+  //
+  // Therefore, this test simply checks that the /proc/self/exe
+  // is absolute and *doesn't* match argv[1].
   std::string exe =
       gvisor::testing::ProcessExePath(getpid()).ValueOrDie();
   if (exe[0] != '/') {
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index 906f3358d..ff8bdfeb0 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -271,7 +271,7 @@ TEST_F(ForkTest, Alarm) {
   EXPECT_EQ(0, alarmed);
 }
 
-// Child cannot affect parent private memory.
+// Child cannot affect parent private memory. Regression test for b/24137240.
 TEST_F(ForkTest, PrivateMemory) {
   std::atomic<uint32_t> local(0);
 
@@ -298,6 +298,9 @@ TEST_F(ForkTest, PrivateMemory) {
 }
 
 // Kernel-accessed buffers should remain coherent across COW.
+//
+// The buffer must be >= usermem.ZeroCopyMinBytes, as UnsafeAccess operates
+// differently. Regression test for b/33811887.
 TEST_F(ForkTest, COWSegment) {
   constexpr int kBufSize = 1024;
   char* read_buf = private_;
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index 1c4d9f1c7..11fb1b457 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -1418,7 +1418,7 @@ TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) {
 //
 // On most platforms this is trivial, but when the file is mapped via the sentry
 // page cache (which does not yet support writing to shared mappings), a bug
-// caused reads to fail unnecessarily on such mappings.
+// caused reads to fail unnecessarily on such mappings. See b/28913513.
 TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) {
   uintptr_t addr;
   size_t len = strlen(kFileContents);
@@ -1435,7 +1435,7 @@ TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) {
 
 // Tests that EFAULT is returned when invoking a syscall that requires the OS to
 // read past end of file (resulting in a fault in sentry context in the gVisor
-// case).
+// case). See b/28913513.
 TEST_F(MMapFileTest, InternalSigBus) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
@@ -1578,7 +1578,7 @@ TEST_F(MMapFileTest, Bug38498194) {
 }
 
 // Tests that reading from a file to a memory mapping of the same file does not
-// deadlock.
+// deadlock. See b/34813270.
 TEST_F(MMapFileTest, SelfRead) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
@@ -1590,7 +1590,7 @@ TEST_F(MMapFileTest, SelfRead) {
 }
 
 // Tests that writing to a file from a memory mapping of the same file does not
-// deadlock.
+// deadlock. Regression test for b/34813270.
 TEST_F(MMapFileTest, SelfWrite) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc
index 431733dbe..902d0a0dc 100644
--- a/test/syscalls/linux/open_create.cc
+++ b/test/syscalls/linux/open_create.cc
@@ -132,6 +132,7 @@ TEST(CreateTest, CreateFailsOnDirWithoutWritePerms) {
 }
 
 // A file originally created RW, but opened RO can later be opened RW.
+// Regression test for b/65385065.
 TEST(CreateTest, OpenCreateROThenRW) {
   TempPath file(NewTempAbsPath());
 
diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc
index f7ea44054..5b0743fe9 100644
--- a/test/syscalls/linux/preadv.cc
+++ b/test/syscalls/linux/preadv.cc
@@ -37,6 +37,7 @@ namespace testing {
 
 namespace {
 
+// Stress copy-on-write. Attempts to reproduce b/38430174.
 TEST(PreadvTest, MMConcurrencyStress) {
   // Fill a one-page file with zeroes (the contents don't really matter).
   const auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 169b723eb..a23fdb58d 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1352,13 +1352,19 @@ TEST(ProcPidSymlink, SubprocessZombied) {
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
-  // 4.17 & gVisor: Syscall succeeds and returns 1
+  //
+  // ~4.3: Syscall fails with EACCES.
+  // 4.17 & gVisor: Syscall succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(EACCES));
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
-  // 4.17 &  gVisor: Syscall succeeds and returns 1.
+  //
+  // ~4.3: Syscall fails with EACCES.
+  // 4.17 & gVisor: Syscall succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(EACCES));
 }
@@ -1431,8 +1437,12 @@ TEST(ProcPidFile, SubprocessRunning) {
 TEST(ProcPidFile, SubprocessZombie) {
   char buf[1];
 
-  // 4.17: Succeeds and returns 1
-  // gVisor: Succeeds and returns 0
+  // FIXME(gvisor.dev/issue/164): Loosen requirement due to inconsistent
+  // behavior on different kernels.
+  //
+  // ~4.3: Succeds and returns 0.
+  // 4.17: Succeeds and returns 1.
+  // gVisor: Succeeds and returns 0.
   EXPECT_THAT(ReadWhileZombied("auxv", buf, sizeof(buf)), SyscallSucceeds());
 
   EXPECT_THAT(ReadWhileZombied("cmdline", buf, sizeof(buf)),
@@ -1458,7 +1468,10 @@ TEST(ProcPidFile, SubprocessZombie) {
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
+  //
+  // ~4.3: Fails and returns EACCES.
   // gVisor & 4.17: Succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadWhileZombied("io", buf, sizeof(buf)),
   //          SyscallFailsWithErrno(EACCES));
 }
@@ -1467,9 +1480,12 @@ TEST(ProcPidFile, SubprocessZombie) {
 TEST(ProcPidFile, SubprocessExited) {
   char buf[1];
 
-  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels
+  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels.
+  //
+  // ~4.3: Fails and returns ESRCH.
   // gVisor: Fails with ESRCH.
   // 4.17: Succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadWhileExited("auxv", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(ESRCH));
 
@@ -1641,7 +1657,7 @@ TEST(ProcTask, KilledThreadsDisappear) {
   EXPECT_NO_ERRNO(DirContainsExactly("/proc/self/task",
                                      TaskFiles(initial, {child1.Tid()})));
 
-  // Stat child1's task file.
+  // Stat child1's task file. Regression test for b/32097707.
   struct stat statbuf;
   const std::string child1_task_file =
       absl::StrCat("/proc/self/task/", child1.Tid());
@@ -1669,7 +1685,7 @@ TEST(ProcTask, KilledThreadsDisappear) {
   EXPECT_NO_ERRNO(EventuallyDirContainsExactly(
       "/proc/self/task", TaskFiles(initial, {child3.Tid(), child5.Tid()})));
 
-  // Stat child1's task file again.  This time it should fail.
+  // Stat child1's task file again.  This time it should fail. See b/32097707.
   EXPECT_THAT(stat(child1_task_file.c_str(), &statbuf),
               SyscallFailsWithErrno(ENOENT));
 
@@ -1824,7 +1840,7 @@ TEST(ProcSysVmOvercommitMemory, HasNumericValue) {
 }
 
 // Check that link for proc fd entries point the target node, not the
-// symlink itself.
+// symlink itself. Regression test for b/31155070.
 TEST(ProcTaskFd, FstatatFollowsSymlink) {
   const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
   const FileDescriptor fd =
@@ -1883,6 +1899,20 @@ TEST(ProcMounts, IsSymlink) {
   EXPECT_EQ(link, "self/mounts");
 }
 
+TEST(ProcSelfMountinfo, RequiredFieldsArePresent) {
+  auto mountinfo =
+      ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mountinfo"));
+  EXPECT_THAT(
+      mountinfo,
+      AllOf(
+          // Root mount.
+          ContainsRegex(
+              R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / / (rw|ro).*- \S+ \S+ (rw|ro)\S*)"),
+          // Proc mount - always rw.
+          ContainsRegex(
+              R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / /proc rw.*- \S+ \S+ rw\S*)")));
+}
+
 // Check that /proc/self/mounts looks something like a real mounts file.
 TEST(ProcSelfMounts, RequiredFieldsArePresent) {
   auto mounts = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mounts"));
diff --git a/test/syscalls/linux/readv.cc b/test/syscalls/linux/readv.cc
index 4069cbc7e..baaf9f757 100644
--- a/test/syscalls/linux/readv.cc
+++ b/test/syscalls/linux/readv.cc
@@ -254,7 +254,9 @@ TEST_F(ReadvTest, IovecOutsideTaskAddressRangeInNonemptyArray) {
 // This test depends on the maximum extent of a single readv() syscall, so
 // we can't tolerate interruption from saving.
 TEST(ReadvTestNoFixture, TruncatedAtMax_NoRandomSave) {
-  // Ensure that we won't be interrupted by ITIMER_PROF.
+  // Ensure that we won't be interrupted by ITIMER_PROF. This is particularly
+  // important in environments where automated profiling tools may start
+  // ITIMER_PROF automatically.
   struct itimerval itv = {};
   auto const cleanup_itimer =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_PROF, itv));
diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc
index 106c045e3..4bfb1ff56 100644
--- a/test/syscalls/linux/rseq.cc
+++ b/test/syscalls/linux/rseq.cc
@@ -36,7 +36,7 @@ namespace {
 // We must be very careful about how these tests are written. Each thread may
 // only have one struct rseq registration, which may be done automatically at
 // thread start (as of 2019-11-13, glibc does *not* support rseq and thus does
-// not do so).
+// not do so, but other libraries do).
 //
 // Testing of rseq is thus done primarily in a child process with no
 // registration. This means exec'ing a nostdlib binary, as rseq registration can
diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc
index 424e2a67f..be2364fb8 100644
--- a/test/syscalls/linux/select.cc
+++ b/test/syscalls/linux/select.cc
@@ -146,7 +146,7 @@ TEST_F(SelectTest, IgnoreBitsAboveNfds) {
 
 // This test illustrates Linux's behavior of 'select' calls passing after
 // setrlimit RLIMIT_NOFILE is called. In particular, versions of sshd rely on
-// this behavior.
+// this behavior. See b/122318458.
 TEST_F(SelectTest, SetrlimitCallNOFILE) {
   fd_set read_set;
   FD_ZERO(&read_set);
diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc
index 7ba752599..c7fdbb924 100644
--- a/test/syscalls/linux/shm.cc
+++ b/test/syscalls/linux/shm.cc
@@ -473,7 +473,7 @@ TEST(ShmTest, PartialUnmap) {
 }
 
 // Check that sentry does not panic when asked for a zero-length private shm
-// segment.
+// segment. Regression test for b/110694797.
 TEST(ShmTest, GracefullyFailOnZeroLenSegmentCreation) {
   EXPECT_THAT(Shmget(IPC_PRIVATE, 0, 0), PosixErrorIs(EINVAL, _));
 }
diff --git a/test/syscalls/linux/sigprocmask.cc b/test/syscalls/linux/sigprocmask.cc
index 654c6a47f..a603fc1d1 100644
--- a/test/syscalls/linux/sigprocmask.cc
+++ b/test/syscalls/linux/sigprocmask.cc
@@ -237,7 +237,7 @@ TEST_F(SigProcMaskTest, SignalHandler) {
 }
 
 // Check that sigprocmask correctly handles aliasing of the set and oldset
-// pointers.
+// pointers. Regression test for b/30502311.
 TEST_F(SigProcMaskTest, AliasedSets) {
   sigset_t mask;
 
diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc
index 276a94eb8..884319e1d 100644
--- a/test/syscalls/linux/socket_unix_non_stream.cc
+++ b/test/syscalls/linux/socket_unix_non_stream.cc
@@ -109,7 +109,7 @@ PosixErrorOr<std::vector<Mapping>> CreateFragmentedRegion(const int size,
 }
 
 // A contiguous iov that is heavily fragmented in FileMem can still be sent
-// successfully.
+// successfully. See b/115833655.
 TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -165,7 +165,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) {
 }
 
 // A contiguous iov that is heavily fragmented in FileMem can still be received
-// into successfully.
+// into successfully. Regression test for b/115833655.
 TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc
index b249ff91f..03ee1250d 100644
--- a/test/syscalls/linux/symlink.cc
+++ b/test/syscalls/linux/symlink.cc
@@ -38,7 +38,7 @@ mode_t FilePermission(const std::string& path) {
 }
 
 // Test that name collisions are checked on the new link path, not the source
-// path.
+// path. Regression test for b/31782115.
 TEST(SymlinkTest, CanCreateSymlinkWithCachedSourceDirent) {
   const std::string srcname = NewTempAbsPath();
   const std::string newname = NewTempAbsPath();
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 8a8b68e75..c4591a3b9 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -244,7 +244,8 @@ TEST_P(TcpSocketTest, ZeroWriteAllowed) {
 }
 
 // Test that a non-blocking write with a buffer that is larger than the send
-// buffer size will not actually write the whole thing at once.
+// buffer size will not actually write the whole thing at once. Regression test
+// for b/64438887.
 TEST_P(TcpSocketTest, NonblockingLargeWrite) {
   // Set the FD to O_NONBLOCK.
   int opts;
diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc
index c7eead17e..1ccb95733 100644
--- a/test/syscalls/linux/time.cc
+++ b/test/syscalls/linux/time.cc
@@ -62,6 +62,7 @@ TEST(TimeTest, VsyscallTime_InvalidAddressSIGSEGV) {
               ::testing::KilledBySignal(SIGSEGV), "");
 }
 
+// Mimics the gettimeofday(2) wrapper from the Go runtime <= 1.2.
 int vsyscall_gettimeofday(struct timeval* tv, struct timezone* tz) {
   constexpr uint64_t kVsyscallGettimeofdayEntry = 0xffffffffff600000;
   return reinterpret_cast<int (*)(struct timeval*, struct timezone*)>(
diff --git a/test/syscalls/linux/tkill.cc b/test/syscalls/linux/tkill.cc
index bae377c69..8d8ebbb24 100644
--- a/test/syscalls/linux/tkill.cc
+++ b/test/syscalls/linux/tkill.cc
@@ -54,7 +54,7 @@ void SigHandler(int sig, siginfo_t* info, void* context) {
   TEST_CHECK(info->si_code == SI_TKILL);
 }
 
-// Test with a real signal.
+// Test with a real signal. Regression test for b/24790092.
 TEST(TkillTest, ValidTIDAndRealSignal) {
   struct sigaction sa;
   sa.sa_sigaction = SigHandler;
diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc
index 35aacb172..9c10b6674 100644
--- a/test/util/temp_path.cc
+++ b/test/util/temp_path.cc
@@ -77,6 +77,7 @@ std::string NewTempAbsPath() {
 std::string NewTempRelPath() { return NextTempBasename(); }
 
 std::string GetAbsoluteTestTmpdir() {
+  // Note that TEST_TMPDIR is guaranteed to be set.
   char* env_tmpdir = getenv("TEST_TMPDIR");
   std::string tmp_dir =
       env_tmpdir != nullptr ? std::string(env_tmpdir) : "/tmp";
diff --git a/tools/build/tags.bzl b/tools/build/tags.bzl
index e99c87f81..a6db44e47 100644
--- a/tools/build/tags.bzl
+++ b/tools/build/tags.bzl
@@ -33,4 +33,8 @@ go_suffixes = [
     "_wasm_unsafe",
     "_linux",
     "_linux_unsafe",
+    "_opts",
+    "_opts_unsafe",
+    "_impl",
+    "_impl_unsafe",
 ]
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 5d5fa134a..c03b557ae 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -73,6 +73,16 @@ def calculate_sets(srcs):
             result[target].append(file)
     return result
 
+def go_imports(name, src, out):
+    """Simplify a single Go source file by eliminating unused imports."""
+    native.genrule(
+        name = name,
+        srcs = [src],
+        outs = [out],
+        tools = ["@org_golang_x_tools//cmd/goimports:goimports"],
+        cmd = ("$(location @org_golang_x_tools//cmd/goimports:goimports) $(SRCS) > $@"),
+    )
+
 def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs):
     """Wraps the standard go_library and does stateification and marshalling.
 
@@ -107,10 +117,15 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
         state_sets = calculate_sets(srcs)
         for (suffix, srcs) in state_sets.items():
             go_stateify(
-                name = name + suffix + "_state_autogen",
+                name = name + suffix + "_state_autogen_with_imports",
                 srcs = srcs,
                 imports = imports,
                 package = name,
+                out = name + suffix + "_state_autogen_with_imports.go",
+            )
+            go_imports(
+                name = name + suffix + "_state_autogen",
+                src = name + suffix + "_state_autogen_with_imports.go",
                 out = name + suffix + "_state_autogen.go",
             )
         all_srcs = all_srcs + [
-- 
cgit v1.2.3


From 0e96fcafd4404e1418c84b7830b9455867e174bb Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 6 Feb 2020 10:11:15 -0800
Subject: Fix test case on AMD.

When ignored, the trap should be executed which generates
a SIGSEGV as in the above case.

PiperOrigin-RevId: 293618489
---
 test/syscalls/linux/32bit.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index 9883aef61..c47a05181 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -155,7 +155,7 @@ TEST(Syscall32Bit, Syscall) {
     case PlatformSupport::Ignored:
       // See above.
       EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
-                  ::testing::KilledBySignal(SIGILL), "");
+                  ::testing::KilledBySignal(SIGSEGV), "");
       break;
 
     case PlatformSupport::Allowed:
-- 
cgit v1.2.3


From 6de49546cb32806896cec27d3ab76e96323ecac1 Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Fri, 7 Feb 2020 13:18:19 -0800
Subject: Refactor syscall tests

- Move shared helpers V4Multicast and V4Broadcast to socket_test_util
- Add unnamed namespace so socket_ipv4_tcp_unbound_external_networking_test.cc
  and socket_ipv4_udp_unbound_external_networking_test.cc can be compiled
  together
- Add test files to "exports_files" so they can be included by Fuchsia's syscall
  test setup

PiperOrigin-RevId: 293880429
---
 test/syscalls/linux/BUILD                           |  3 +++
 ...ket_ipv4_tcp_unbound_external_networking_test.cc |  3 +++
 test/syscalls/linux/socket_ipv4_udp_unbound.cc      | 21 ---------------------
 .../socket_ipv4_udp_unbound_external_networking.cc  | 20 --------------------
 ...ket_ipv4_udp_unbound_external_networking_test.cc |  3 +++
 test/syscalls/linux/socket_test_util.cc             | 18 ++++++++++++++++++
 test/syscalls/linux/socket_test_util.h              |  5 +++++
 7 files changed, 32 insertions(+), 41 deletions(-)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index f2e3c7072..12d389c3e 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -12,6 +12,9 @@ exports_files(
         "socket_ip_loopback_blocking.cc",
         "socket_ip_tcp_loopback.cc",
         "socket_ip_udp_loopback.cc",
+        "socket_ip_unbound.cc",
+        "socket_ipv4_tcp_unbound_external_networking_test.cc",
+        "socket_ipv4_udp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
         "tcp_socket.cc",
         "udp_socket.cc",
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
index 3ac790873..797c4174e 100644
--- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
@@ -22,6 +22,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketKind> GetSockets() {
   return ApplyVec<SocketKind>(
@@ -32,5 +33,7 @@ std::vector<SocketKind> GetSockets() {
 INSTANTIATE_TEST_SUITE_P(IPv4TCPUnboundSockets,
                          IPv4TCPUnboundExternalNetworkingSocketTest,
                          ::testing::ValuesIn(GetSockets()));
+
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index aa6fb4e3f..990ccf23c 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -30,27 +30,6 @@
 namespace gvisor {
 namespace testing {
 
-constexpr char kMulticastAddress[] = "224.0.2.1";
-constexpr char kBroadcastAddress[] = "255.255.255.255";
-
-TestAddress V4Multicast() {
-  TestAddress t("V4Multicast");
-  t.addr.ss_family = AF_INET;
-  t.addr_len = sizeof(sockaddr_in);
-  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
-      inet_addr(kMulticastAddress);
-  return t;
-}
-
-TestAddress V4Broadcast() {
-  TestAddress t("V4Broadcast");
-  t.addr.ss_family = AF_INET;
-  t.addr_len = sizeof(sockaddr_in);
-  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
-      inet_addr(kBroadcastAddress);
-  return t;
-}
-
 // Check that packets are not received without a group membership. Default send
 // interface configured by bind.
 TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNoGroup) {
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
index 98ae414f3..40e673625 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
@@ -41,26 +41,6 @@ TestAddress V4EmptyAddress() {
   return t;
 }
 
-constexpr char kMulticastAddress[] = "224.0.2.1";
-
-TestAddress V4Multicast() {
-  TestAddress t("V4Multicast");
-  t.addr.ss_family = AF_INET;
-  t.addr_len = sizeof(sockaddr_in);
-  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
-      inet_addr(kMulticastAddress);
-  return t;
-}
-
-TestAddress V4Broadcast() {
-  TestAddress t("V4Broadcast");
-  t.addr.ss_family = AF_INET;
-  t.addr_len = sizeof(sockaddr_in);
-  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
-      htonl(INADDR_BROADCAST);
-  return t;
-}
-
 void IPv4UDPUnboundExternalNetworkingSocketTest::SetUp() {
   got_if_infos_ = false;
 
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
index 8f47952b0..f6e64c157 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
@@ -22,6 +22,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketKind> GetSockets() {
   return ApplyVec<SocketKind>(
@@ -32,5 +33,7 @@ std::vector<SocketKind> GetSockets() {
 INSTANTIATE_TEST_SUITE_P(IPv4UDPUnboundSockets,
                          IPv4UDPUnboundExternalNetworkingSocketTest,
                          ::testing::ValuesIn(GetSockets()));
+
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
index c0c5ab3fe..5d3a39868 100644
--- a/test/syscalls/linux/socket_test_util.cc
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -805,6 +805,24 @@ TestAddress V4MappedLoopback() {
   return t;
 }
 
+TestAddress V4Multicast() {
+  TestAddress t("V4Multicast");
+  t.addr.ss_family = AF_INET;
+  t.addr_len = sizeof(sockaddr_in);
+  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
+      inet_addr(kMulticastAddress);
+  return t;
+}
+
+TestAddress V4Broadcast() {
+  TestAddress t("V4Broadcast");
+  t.addr.ss_family = AF_INET;
+  t.addr_len = sizeof(sockaddr_in);
+  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
+      htonl(INADDR_BROADCAST);
+  return t;
+}
+
 TestAddress V6Any() {
   TestAddress t("V6Any");
   t.addr.ss_family = AF_INET6;
diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h
index bfaa6e397..734b48b96 100644
--- a/test/syscalls/linux/socket_test_util.h
+++ b/test/syscalls/linux/socket_test_util.h
@@ -484,10 +484,15 @@ struct TestAddress {
       : description(std::move(description)), addr(), addr_len() {}
 };
 
+constexpr char kMulticastAddress[] = "224.0.2.1";
+constexpr char kBroadcastAddress[] = "255.255.255.255";
+
 TestAddress V4Any();
+TestAddress V4Broadcast();
 TestAddress V4Loopback();
 TestAddress V4MappedAny();
 TestAddress V4MappedLoopback();
+TestAddress V4Multicast();
 TestAddress V6Any();
 TestAddress V6Loopback();
 
-- 
cgit v1.2.3


From 17b9f5e66238bde1e4ed3bd9e5fb67342c8b58ec Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 7 Feb 2020 14:46:24 -0800
Subject: Support listxattr and removexattr syscalls.

Note that these are only implemented for tmpfs, and other impls will still
return EOPNOTSUPP.

PiperOrigin-RevId: 293899385
---
 pkg/p9/client_file.go                      |  33 ++++
 pkg/p9/file.go                             |  16 ++
 pkg/p9/handlers.go                         |  33 ++++
 pkg/p9/messages.go                         | 199 +++++++++++++++----
 pkg/p9/p9.go                               |   4 +
 pkg/p9/version.go                          |   8 +-
 pkg/sentry/fs/copy_up.go                   |   2 +-
 pkg/sentry/fs/fsutil/inode.go              |  20 +-
 pkg/sentry/fs/gofer/context_file.go        |  14 ++
 pkg/sentry/fs/gofer/inode.go               |  13 +-
 pkg/sentry/fs/inode.go                     |  14 +-
 pkg/sentry/fs/inode_operations.go          |  13 +-
 pkg/sentry/fs/inode_overlay.go             |  18 +-
 pkg/sentry/fs/tmpfs/tmpfs.go               |   9 +-
 pkg/sentry/syscalls/linux/linux64_amd64.go |  27 ++-
 pkg/sentry/syscalls/linux/linux64_arm64.go |  37 ++--
 pkg/sentry/syscalls/linux/sys_xattr.go     | 200 +++++++++++++++++++-
 runsc/fsgofer/fsgofer.go                   |  16 +-
 test/syscalls/linux/BUILD                  |   1 +
 test/syscalls/linux/xattr.cc               | 294 ++++++++++++++++-------------
 20 files changed, 733 insertions(+), 238 deletions(-)

(limited to 'test/syscalls')

diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go
index 0254e4ccc..2ee07b664 100644
--- a/pkg/p9/client_file.go
+++ b/pkg/p9/client_file.go
@@ -194,6 +194,39 @@ func (c *clientFile) SetXattr(name, value string, flags uint32) error {
 	return c.client.sendRecv(&Tsetxattr{FID: c.fid, Name: name, Value: value, Flags: flags}, &Rsetxattr{})
 }
 
+// ListXattr implements File.ListXattr.
+func (c *clientFile) ListXattr(size uint64) (map[string]struct{}, error) {
+	if atomic.LoadUint32(&c.closed) != 0 {
+		return nil, syscall.EBADF
+	}
+	if !versionSupportsListRemoveXattr(c.client.version) {
+		return nil, syscall.EOPNOTSUPP
+	}
+
+	rlistxattr := Rlistxattr{}
+	if err := c.client.sendRecv(&Tlistxattr{FID: c.fid, Size: size}, &rlistxattr); err != nil {
+		return nil, err
+	}
+
+	xattrs := make(map[string]struct{}, len(rlistxattr.Xattrs))
+	for _, x := range rlistxattr.Xattrs {
+		xattrs[x] = struct{}{}
+	}
+	return xattrs, nil
+}
+
+// RemoveXattr implements File.RemoveXattr.
+func (c *clientFile) RemoveXattr(name string) error {
+	if atomic.LoadUint32(&c.closed) != 0 {
+		return syscall.EBADF
+	}
+	if !versionSupportsListRemoveXattr(c.client.version) {
+		return syscall.EOPNOTSUPP
+	}
+
+	return c.client.sendRecv(&Tremovexattr{FID: c.fid, Name: name}, &Rremovexattr{})
+}
+
 // Allocate implements File.Allocate.
 func (c *clientFile) Allocate(mode AllocateMode, offset, length uint64) error {
 	if atomic.LoadUint32(&c.closed) != 0 {
diff --git a/pkg/p9/file.go b/pkg/p9/file.go
index 4607cfcdf..d4ffbc8e3 100644
--- a/pkg/p9/file.go
+++ b/pkg/p9/file.go
@@ -105,6 +105,22 @@ type File interface {
 	// TODO(b/127675828): Determine concurrency guarantees once implemented.
 	SetXattr(name, value string, flags uint32) error
 
+	// ListXattr lists the names of the extended attributes on this node.
+	//
+	// Size indicates the size of the buffer that has been allocated to hold the
+	// attribute list. If the list would be larger than size, implementations may
+	// return ERANGE to indicate that the buffer is too small, but they are also
+	// free to ignore the hint entirely (i.e. the value returned may be larger
+	// than size). All size checking is done independently at the syscall layer.
+	//
+	// TODO(b/148303075): Determine concurrency guarantees once implemented.
+	ListXattr(size uint64) (map[string]struct{}, error)
+
+	// RemoveXattr removes extended attributes on this node.
+	//
+	// TODO(b/148303075): Determine concurrency guarantees once implemented.
+	RemoveXattr(name string) error
+
 	// Allocate allows the caller to directly manipulate the allocated disk space
 	// for the file. See fallocate(2) for more details.
 	Allocate(mode AllocateMode, offset, length uint64) error
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index 7d6653a07..2ac45eb80 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -941,6 +941,39 @@ func (t *Tsetxattr) handle(cs *connState) message {
 	return &Rsetxattr{}
 }
 
+// handle implements handler.handle.
+func (t *Tlistxattr) handle(cs *connState) message {
+	ref, ok := cs.LookupFID(t.FID)
+	if !ok {
+		return newErr(syscall.EBADF)
+	}
+	defer ref.DecRef()
+
+	xattrs, err := ref.file.ListXattr(t.Size)
+	if err != nil {
+		return newErr(err)
+	}
+	xattrList := make([]string, 0, len(xattrs))
+	for x := range xattrs {
+		xattrList = append(xattrList, x)
+	}
+	return &Rlistxattr{Xattrs: xattrList}
+}
+
+// handle implements handler.handle.
+func (t *Tremovexattr) handle(cs *connState) message {
+	ref, ok := cs.LookupFID(t.FID)
+	if !ok {
+		return newErr(syscall.EBADF)
+	}
+	defer ref.DecRef()
+
+	if err := ref.file.RemoveXattr(t.Name); err != nil {
+		return newErr(err)
+	}
+	return &Rremovexattr{}
+}
+
 // handle implements handler.handle.
 func (t *Treaddir) handle(cs *connState) message {
 	ref, ok := cs.LookupFID(t.Directory)
diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go
index ceb723d86..b1cede5f5 100644
--- a/pkg/p9/messages.go
+++ b/pkg/p9/messages.go
@@ -174,11 +174,11 @@ type Rflush struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rflush) Decode(b *buffer) {
+func (*Rflush) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rflush) Encode(b *buffer) {
+func (*Rflush) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -188,7 +188,7 @@ func (*Rflush) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rflush) String() string {
-	return fmt.Sprintf("RFlush{}")
+	return "RFlush{}"
 }
 
 // Twalk is a walk request.
@@ -300,11 +300,11 @@ type Rclunk struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rclunk) Decode(b *buffer) {
+func (*Rclunk) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rclunk) Encode(b *buffer) {
+func (*Rclunk) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -314,7 +314,7 @@ func (*Rclunk) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rclunk) String() string {
-	return fmt.Sprintf("Rclunk{}")
+	return "Rclunk{}"
 }
 
 // Tremove is a remove request.
@@ -350,11 +350,11 @@ type Rremove struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rremove) Decode(b *buffer) {
+func (*Rremove) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rremove) Encode(b *buffer) {
+func (*Rremove) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -364,7 +364,7 @@ func (*Rremove) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rremove) String() string {
-	return fmt.Sprintf("Rremove{}")
+	return "Rremove{}"
 }
 
 // Rlerror is an error response.
@@ -745,16 +745,16 @@ func (*Rlink) Type() MsgType {
 }
 
 // Decode implements encoder.Decode.
-func (*Rlink) Decode(b *buffer) {
+func (*Rlink) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rlink) Encode(b *buffer) {
+func (*Rlink) Encode(*buffer) {
 }
 
 // String implements fmt.Stringer.
 func (r *Rlink) String() string {
-	return fmt.Sprintf("Rlink{}")
+	return "Rlink{}"
 }
 
 // Trenameat is a rename request.
@@ -803,11 +803,11 @@ type Rrenameat struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rrenameat) Decode(b *buffer) {
+func (*Rrenameat) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rrenameat) Encode(b *buffer) {
+func (*Rrenameat) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -817,7 +817,7 @@ func (*Rrenameat) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rrenameat) String() string {
-	return fmt.Sprintf("Rrenameat{}")
+	return "Rrenameat{}"
 }
 
 // Tunlinkat is an unlink request.
@@ -861,11 +861,11 @@ type Runlinkat struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Runlinkat) Decode(b *buffer) {
+func (*Runlinkat) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Runlinkat) Encode(b *buffer) {
+func (*Runlinkat) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -875,7 +875,7 @@ func (*Runlinkat) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Runlinkat) String() string {
-	return fmt.Sprintf("Runlinkat{}")
+	return "Runlinkat{}"
 }
 
 // Trename is a rename request.
@@ -922,11 +922,11 @@ type Rrename struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rrename) Decode(b *buffer) {
+func (*Rrename) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rrename) Encode(b *buffer) {
+func (*Rrename) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -936,7 +936,7 @@ func (*Rrename) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rrename) String() string {
-	return fmt.Sprintf("Rrename{}")
+	return "Rrename{}"
 }
 
 // Treadlink is a readlink request.
@@ -1409,11 +1409,11 @@ type Rsetattr struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rsetattr) Decode(b *buffer) {
+func (*Rsetattr) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rsetattr) Encode(b *buffer) {
+func (*Rsetattr) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1423,7 +1423,7 @@ func (*Rsetattr) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rsetattr) String() string {
-	return fmt.Sprintf("Rsetattr{}")
+	return "Rsetattr{}"
 }
 
 // Tallocate is an allocate request. This is an extension to 9P protocol, not
@@ -1466,11 +1466,11 @@ type Rallocate struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rallocate) Decode(b *buffer) {
+func (*Rallocate) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rallocate) Encode(b *buffer) {
+func (*Rallocate) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1480,7 +1480,71 @@ func (*Rallocate) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rallocate) String() string {
-	return fmt.Sprintf("Rallocate{}")
+	return "Rallocate{}"
+}
+
+// Tlistxattr is a listxattr request.
+type Tlistxattr struct {
+	// FID refers to the file on which to list xattrs.
+	FID FID
+
+	// Size is the buffer size for the xattr list.
+	Size uint64
+}
+
+// Decode implements encoder.Decode.
+func (t *Tlistxattr) Decode(b *buffer) {
+	t.FID = b.ReadFID()
+	t.Size = b.Read64()
+}
+
+// Encode implements encoder.Encode.
+func (t *Tlistxattr) Encode(b *buffer) {
+	b.WriteFID(t.FID)
+	b.Write64(t.Size)
+}
+
+// Type implements message.Type.
+func (*Tlistxattr) Type() MsgType {
+	return MsgTlistxattr
+}
+
+// String implements fmt.Stringer.
+func (t *Tlistxattr) String() string {
+	return fmt.Sprintf("Tlistxattr{FID: %d, Size: %d}", t.FID, t.Size)
+}
+
+// Rlistxattr is a listxattr response.
+type Rlistxattr struct {
+	// Xattrs is a list of extended attribute names.
+	Xattrs []string
+}
+
+// Decode implements encoder.Decode.
+func (r *Rlistxattr) Decode(b *buffer) {
+	n := b.Read16()
+	r.Xattrs = r.Xattrs[:0]
+	for i := 0; i < int(n); i++ {
+		r.Xattrs = append(r.Xattrs, b.ReadString())
+	}
+}
+
+// Encode implements encoder.Encode.
+func (r *Rlistxattr) Encode(b *buffer) {
+	b.Write16(uint16(len(r.Xattrs)))
+	for _, x := range r.Xattrs {
+		b.WriteString(x)
+	}
+}
+
+// Type implements message.Type.
+func (*Rlistxattr) Type() MsgType {
+	return MsgRlistxattr
+}
+
+// String implements fmt.Stringer.
+func (r *Rlistxattr) String() string {
+	return fmt.Sprintf("Rlistxattr{Xattrs: %v}", r.Xattrs)
 }
 
 // Txattrwalk walks extended attributes.
@@ -1594,11 +1658,11 @@ type Rxattrcreate struct {
 }
 
 // Decode implements encoder.Decode.
-func (r *Rxattrcreate) Decode(b *buffer) {
+func (r *Rxattrcreate) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (r *Rxattrcreate) Encode(b *buffer) {
+func (r *Rxattrcreate) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1608,7 +1672,7 @@ func (*Rxattrcreate) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rxattrcreate) String() string {
-	return fmt.Sprintf("Rxattrcreate{}")
+	return "Rxattrcreate{}"
 }
 
 // Tgetxattr is a getxattr request.
@@ -1719,11 +1783,11 @@ type Rsetxattr struct {
 }
 
 // Decode implements encoder.Decode.
-func (r *Rsetxattr) Decode(b *buffer) {
+func (r *Rsetxattr) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (r *Rsetxattr) Encode(b *buffer) {
+func (r *Rsetxattr) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1733,7 +1797,60 @@ func (*Rsetxattr) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rsetxattr) String() string {
-	return fmt.Sprintf("Rsetxattr{}")
+	return "Rsetxattr{}"
+}
+
+// Tremovexattr is a removexattr request.
+type Tremovexattr struct {
+	// FID refers to the file on which to set xattrs.
+	FID FID
+
+	// Name is the attribute name.
+	Name string
+}
+
+// Decode implements encoder.Decode.
+func (t *Tremovexattr) Decode(b *buffer) {
+	t.FID = b.ReadFID()
+	t.Name = b.ReadString()
+}
+
+// Encode implements encoder.Encode.
+func (t *Tremovexattr) Encode(b *buffer) {
+	b.WriteFID(t.FID)
+	b.WriteString(t.Name)
+}
+
+// Type implements message.Type.
+func (*Tremovexattr) Type() MsgType {
+	return MsgTremovexattr
+}
+
+// String implements fmt.Stringer.
+func (t *Tremovexattr) String() string {
+	return fmt.Sprintf("Tremovexattr{FID: %d, Name: %s}", t.FID, t.Name)
+}
+
+// Rremovexattr is a removexattr response.
+type Rremovexattr struct {
+}
+
+// Decode implements encoder.Decode.
+func (r *Rremovexattr) Decode(*buffer) {
+}
+
+// Encode implements encoder.Encode.
+func (r *Rremovexattr) Encode(*buffer) {
+}
+
+// Type implements message.Type.
+func (*Rremovexattr) Type() MsgType {
+	return MsgRremovexattr
+}
+
+// String implements fmt.Stringer.
+func (r *Rremovexattr) String() string {
+	return "Rremovexattr{}"
 }
 
 // Treaddir is a readdir request.
@@ -1880,11 +1997,11 @@ type Rfsync struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rfsync) Decode(b *buffer) {
+func (*Rfsync) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rfsync) Encode(b *buffer) {
+func (*Rfsync) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1894,7 +2011,7 @@ func (*Rfsync) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rfsync) String() string {
-	return fmt.Sprintf("Rfsync{}")
+	return "Rfsync{}"
 }
 
 // Tstatfs is a stat request.
@@ -1980,11 +2097,11 @@ type Rflushf struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rflushf) Decode(b *buffer) {
+func (*Rflushf) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rflushf) Encode(b *buffer) {
+func (*Rflushf) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1994,7 +2111,7 @@ func (*Rflushf) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (*Rflushf) String() string {
-	return fmt.Sprintf("Rflushf{}")
+	return "Rflushf{}"
 }
 
 // Twalkgetattr is a walk request.
@@ -2484,6 +2601,8 @@ func init() {
 	msgRegistry.register(MsgRgetattr, func() message { return &Rgetattr{} })
 	msgRegistry.register(MsgTsetattr, func() message { return &Tsetattr{} })
 	msgRegistry.register(MsgRsetattr, func() message { return &Rsetattr{} })
+	msgRegistry.register(MsgTlistxattr, func() message { return &Tlistxattr{} })
+	msgRegistry.register(MsgRlistxattr, func() message { return &Rlistxattr{} })
 	msgRegistry.register(MsgTxattrwalk, func() message { return &Txattrwalk{} })
 	msgRegistry.register(MsgRxattrwalk, func() message { return &Rxattrwalk{} })
 	msgRegistry.register(MsgTxattrcreate, func() message { return &Txattrcreate{} })
@@ -2492,6 +2611,8 @@ func init() {
 	msgRegistry.register(MsgRgetxattr, func() message { return &Rgetxattr{} })
 	msgRegistry.register(MsgTsetxattr, func() message { return &Tsetxattr{} })
 	msgRegistry.register(MsgRsetxattr, func() message { return &Rsetxattr{} })
+	msgRegistry.register(MsgTremovexattr, func() message { return &Tremovexattr{} })
+	msgRegistry.register(MsgRremovexattr, func() message { return &Rremovexattr{} })
 	msgRegistry.register(MsgTreaddir, func() message { return &Treaddir{} })
 	msgRegistry.register(MsgRreaddir, func() message { return &Rreaddir{} })
 	msgRegistry.register(MsgTfsync, func() message { return &Tfsync{} })
diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go
index 5ab00d625..20ab31f7a 100644
--- a/pkg/p9/p9.go
+++ b/pkg/p9/p9.go
@@ -335,6 +335,8 @@ const (
 	MsgRgetattr             = 25
 	MsgTsetattr             = 26
 	MsgRsetattr             = 27
+	MsgTlistxattr           = 28
+	MsgRlistxattr           = 29
 	MsgTxattrwalk           = 30
 	MsgRxattrwalk           = 31
 	MsgTxattrcreate         = 32
@@ -343,6 +345,8 @@ const (
 	MsgRgetxattr            = 35
 	MsgTsetxattr            = 36
 	MsgRsetxattr            = 37
+	MsgTremovexattr         = 38
+	MsgRremovexattr         = 39
 	MsgTreaddir             = 40
 	MsgRreaddir             = 41
 	MsgTfsync               = 50
diff --git a/pkg/p9/version.go b/pkg/p9/version.go
index 34a15eb55..09cde9f5a 100644
--- a/pkg/p9/version.go
+++ b/pkg/p9/version.go
@@ -26,7 +26,7 @@ const (
 	//
 	// Clients are expected to start requesting this version number and
 	// to continuously decrement it until a Tversion request succeeds.
-	highestSupportedVersion uint32 = 10
+	highestSupportedVersion uint32 = 11
 
 	// lowestSupportedVersion is the lowest supported version X in a
 	// version string of the format 9P2000.L.Google.X.
@@ -167,3 +167,9 @@ func VersionSupportsOpenTruncateFlag(v uint32) bool {
 func versionSupportsGetSetXattr(v uint32) bool {
 	return v >= 10
 }
+
+// versionSupportsListRemoveXattr returns true if version v supports
+// the Tlistxattr and Tremovexattr messages.
+func versionSupportsListRemoveXattr(v uint32) bool {
+	return v >= 11
+}
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index f6c79e51b..b060a12ff 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -401,7 +401,7 @@ func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error
 	if err != nil {
 		return err
 	}
-	lowerXattr, err := lower.ListXattr(ctx)
+	lowerXattr, err := lower.ListXattr(ctx, linux.XATTR_SIZE_MAX)
 	if err != nil && err != syserror.EOPNOTSUPP {
 		return err
 	}
diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go
index 252830572..daecc4ffe 100644
--- a/pkg/sentry/fs/fsutil/inode.go
+++ b/pkg/sentry/fs/fsutil/inode.go
@@ -247,7 +247,7 @@ func (i *InodeSimpleExtendedAttributes) SetXattr(_ context.Context, _ *fs.Inode,
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
+func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) {
 	i.mu.RLock()
 	names := make(map[string]struct{}, len(i.xattrs))
 	for name := range i.xattrs {
@@ -257,6 +257,17 @@ func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (m
 	return names, nil
 }
 
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (i *InodeSimpleExtendedAttributes) RemoveXattr(_ context.Context, _ *fs.Inode, name string) error {
+	i.mu.RLock()
+	defer i.mu.RUnlock()
+	if _, ok := i.xattrs[name]; ok {
+		delete(i.xattrs, name)
+		return nil
+	}
+	return syserror.ENOATTR
+}
+
 // staticFile is a file with static contents. It is returned by
 // InodeStaticFileGetter.GetFile.
 //
@@ -460,10 +471,15 @@ func (InodeNoExtendedAttributes) SetXattr(context.Context, *fs.Inode, string, st
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
+func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) {
 	return nil, syserror.EOPNOTSUPP
 }
 
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (InodeNoExtendedAttributes) RemoveXattr(context.Context, *fs.Inode, string) error {
+	return syserror.EOPNOTSUPP
+}
+
 // InodeNoopRelease implements fs.InodeOperations.Release as a noop.
 type InodeNoopRelease struct{}
 
diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go
index 3da818aed..125907d70 100644
--- a/pkg/sentry/fs/gofer/context_file.go
+++ b/pkg/sentry/fs/gofer/context_file.go
@@ -73,6 +73,20 @@ func (c *contextFile) setXattr(ctx context.Context, name, value string, flags ui
 	return err
 }
 
+func (c *contextFile) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
+	ctx.UninterruptibleSleepStart(false)
+	xattrs, err := c.file.ListXattr(size)
+	ctx.UninterruptibleSleepFinish(false)
+	return xattrs, err
+}
+
+func (c *contextFile) removeXattr(ctx context.Context, name string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := c.file.RemoveXattr(name)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
 func (c *contextFile) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
 	ctx.UninterruptibleSleepStart(false)
 	err := c.file.Allocate(mode, offset, length)
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index ac28174d2..1c934981b 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -604,18 +604,23 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length
 }
 
 // GetXattr implements fs.InodeOperations.GetXattr.
-func (i *inodeOperations) GetXattr(ctx context.Context, inode *fs.Inode, name string, size uint64) (string, error) {
+func (i *inodeOperations) GetXattr(ctx context.Context, _ *fs.Inode, name string, size uint64) (string, error) {
 	return i.fileState.file.getXattr(ctx, name, size)
 }
 
 // SetXattr implements fs.InodeOperations.SetXattr.
-func (i *inodeOperations) SetXattr(ctx context.Context, inode *fs.Inode, name string, value string, flags uint32) error {
+func (i *inodeOperations) SetXattr(ctx context.Context, _ *fs.Inode, name string, value string, flags uint32) error {
 	return i.fileState.file.setXattr(ctx, name, value, flags)
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (i *inodeOperations) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
-	return nil, syscall.EOPNOTSUPP
+func (i *inodeOperations) ListXattr(ctx context.Context, _ *fs.Inode, size uint64) (map[string]struct{}, error) {
+	return i.fileState.file.listXattr(ctx, size)
+}
+
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (i *inodeOperations) RemoveXattr(ctx context.Context, _ *fs.Inode, name string) error {
+	return i.fileState.file.removeXattr(ctx, name)
 }
 
 // Allocate implements fs.InodeOperations.Allocate.
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index b66c091ab..55fb71c16 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -278,11 +278,19 @@ func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, fla
 }
 
 // ListXattr calls i.InodeOperations.ListXattr with i as the Inode.
-func (i *Inode) ListXattr(ctx context.Context) (map[string]struct{}, error) {
+func (i *Inode) ListXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
 	if i.overlay != nil {
-		return overlayListXattr(ctx, i.overlay)
+		return overlayListXattr(ctx, i.overlay, size)
 	}
-	return i.InodeOperations.ListXattr(ctx, i)
+	return i.InodeOperations.ListXattr(ctx, i, size)
+}
+
+// RemoveXattr calls i.InodeOperations.RemoveXattr with i as the Inode.
+func (i *Inode) RemoveXattr(ctx context.Context, d *Dirent, name string) error {
+	if i.overlay != nil {
+		return overlayRemoveXattr(ctx, i.overlay, d, name)
+	}
+	return i.InodeOperations.RemoveXattr(ctx, i, name)
 }
 
 // CheckPermission will check if the caller may access this file in the
diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go
index 70f2eae96..2bbfb72ef 100644
--- a/pkg/sentry/fs/inode_operations.go
+++ b/pkg/sentry/fs/inode_operations.go
@@ -190,7 +190,18 @@ type InodeOperations interface {
 	// ListXattr returns the set of all extended attributes names that
 	// have values. Inodes that do not support extended attributes return
 	// EOPNOTSUPP.
-	ListXattr(ctx context.Context, inode *Inode) (map[string]struct{}, error)
+	//
+	// If this is called through the listxattr(2) syscall, size indicates the
+	// size of the buffer that the application has allocated to hold the
+	// attribute list. If the list would be larger than size, implementations may
+	// return ERANGE to indicate that the buffer is too small, but they are also
+	// free to ignore the hint entirely. All size checking is done independently
+	// at the syscall layer.
+	ListXattr(ctx context.Context, inode *Inode, size uint64) (map[string]struct{}, error)
+
+	// RemoveXattr removes an extended attribute specified by name. Inodes that
+	// do not support extended attributes return EOPNOTSUPP.
+	RemoveXattr(ctx context.Context, inode *Inode, name string) error
 
 	// Check determines whether an Inode can be accessed with the
 	// requested permission mask using the context (which gives access
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index 4729b4aac..5ada33a32 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -564,15 +564,15 @@ func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, valu
 	return o.upper.SetXattr(ctx, d, name, value, flags)
 }
 
-func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}, error) {
+func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[string]struct{}, error) {
 	o.copyMu.RLock()
 	defer o.copyMu.RUnlock()
 	var names map[string]struct{}
 	var err error
 	if o.upper != nil {
-		names, err = o.upper.ListXattr(ctx)
+		names, err = o.upper.ListXattr(ctx, size)
 	} else {
-		names, err = o.lower.ListXattr(ctx)
+		names, err = o.lower.ListXattr(ctx, size)
 	}
 	for name := range names {
 		// Same as overlayGetXattr, we shouldn't forward along
@@ -584,6 +584,18 @@ func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}
 	return names, err
 }
 
+func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error {
+	// Don't allow changes to overlay xattrs through a removexattr syscall.
+	if strings.HasPrefix(XattrOverlayPrefix, name) {
+		return syserror.EPERM
+	}
+
+	if err := copyUp(ctx, d); err != nil {
+		return err
+	}
+	return o.upper.RemoveXattr(ctx, d, name)
+}
+
 func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error {
 	o.copyMu.RLock()
 	// Hot path. Avoid defers.
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index c00cef0a5..3c2b583ae 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -159,8 +159,13 @@ func (d *Dir) SetXattr(ctx context.Context, i *fs.Inode, name, value string, fla
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode) (map[string]struct{}, error) {
-	return d.ramfsDir.ListXattr(ctx, i)
+func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode, size uint64) (map[string]struct{}, error) {
+	return d.ramfsDir.ListXattr(ctx, i, size)
+}
+
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (d *Dir) RemoveXattr(ctx context.Context, i *fs.Inode, name string) error {
+	return d.ramfsDir.RemoveXattr(ctx, i, name)
 }
 
 // Lookup implements fs.InodeOperations.Lookup.
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 588f8b087..79066ad2a 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -228,21 +228,18 @@ var AMD64 = &kernel.SyscallTable{
 		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
-		// TODO(b/148303075): Enable set/getxattr (in their various
-		// forms) once we also have list and removexattr. The JVM
-		// assumes that if get/set exist, then list and remove do too.
-		188: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		189: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		190: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+		194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+		195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+		196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+		197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+		198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+		199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
 		200: syscalls.Supported("tkill", Tkill),
 		201: syscalls.Supported("time", Time),
 		202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 06e5ee401..7421619de 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -36,26 +36,23 @@ var ARM64 = &kernel.SyscallTable{
 	},
 	AuditNumber: linux.AUDIT_ARCH_AARCH64,
 	Table: map[uintptr]kernel.Syscall{
-		0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		// TODO(b/148303075): Enable set/getxattr (in their various
-		// forms) once we also have list and removexattr. The JVM
-		// assumes that if get/set exist, then list and remove do too.
-		5:   syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		6:   syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		7:   syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		8:   syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		13:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		14:  syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		15:  syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		16:  syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		0:   syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		1:   syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+		11:  syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+		12:  syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+		13:  syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+		14:  syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+		15:  syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+		16:  syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
 		17:  syscalls.Supported("getcwd", Getcwd),
 		18:  syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
 		19:  syscalls.Supported("eventfd2", Eventfd2),
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index efb95555c..342337726 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -72,7 +72,7 @@ func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink
 	}
 
 	valueLen := 0
-	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
 		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
 			return syserror.ENOTDIR
 		}
@@ -172,7 +172,7 @@ func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink
 		return 0, nil, err
 	}
 
-	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
 		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
 			return syserror.ENOTDIR
 		}
@@ -187,12 +187,12 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, si
 		return syserror.EINVAL
 	}
 
-	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
 		return err
 	}
 
-	name, err := copyInXattrName(t, nameAddr)
-	if err != nil {
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
 		return err
 	}
 
@@ -226,12 +226,18 @@ func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
 	return name, nil
 }
 
+// Restrict xattrs to regular files and directories.
+//
+// TODO(b/148380782): In Linux, this restriction technically only applies to
+// xattrs in the "user.*" namespace. Make file type checks specific to the
+// namespace once we allow other xattr prefixes.
+func xattrFileTypeOk(i *fs.Inode) bool {
+	return fs.IsRegular(i.StableAttr) || fs.IsDir(i.StableAttr)
+}
+
 func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error {
 	// Restrict xattrs to regular files and directories.
-	//
-	// In Linux, this restriction technically only applies to xattrs in the
-	// "user.*" namespace, but we don't allow any other xattr prefixes anyway.
-	if !fs.IsRegular(i.StableAttr) && !fs.IsDir(i.StableAttr) {
+	if !xattrFileTypeOk(i) {
 		if perms.Write {
 			return syserror.EPERM
 		}
@@ -240,3 +246,179 @@ func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error
 
 	return i.CheckPermission(t, perms)
 }
+
+// ListXattr implements linux syscall listxattr(2).
+func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listXattrFromPath(t, args, true)
+}
+
+// LListXattr implements linux syscall llistxattr(2).
+func LListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listXattrFromPath(t, args, false)
+}
+
+// FListXattr implements linux syscall flistxattr(2).
+func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	listAddr := args[1].Pointer()
+	size := uint64(args[2].SizeT())
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	n, err := listXattr(t, f.Dirent, listAddr, size)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(n), nil, nil
+}
+
+func listXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	listAddr := args[1].Pointer()
+	size := uint64(args[2].SizeT())
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	n := 0
+	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		n, err = listXattr(t, d, listAddr, size)
+		return err
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(n), nil, nil
+}
+
+func listXattr(t *kernel.Task, d *fs.Dirent, addr usermem.Addr, size uint64) (int, error) {
+	if !xattrFileTypeOk(d.Inode) {
+		return 0, nil
+	}
+
+	// If listxattr(2) is called with size 0, the buffer size needed to contain
+	// the xattr list will be returned successfully even if it is nonzero. In
+	// that case, we need to retrieve the entire list so we can compute and
+	// return the correct size.
+	requestedSize := size
+	if size == 0 || size > linux.XATTR_SIZE_MAX {
+		requestedSize = linux.XATTR_SIZE_MAX
+	}
+	xattrs, err := d.Inode.ListXattr(t, requestedSize)
+	if err != nil {
+		return 0, err
+	}
+
+	// TODO(b/148380782): support namespaces other than "user".
+	for x := range xattrs {
+		if !strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
+			delete(xattrs, x)
+		}
+	}
+
+	listSize := xattrListSize(xattrs)
+	if listSize > linux.XATTR_SIZE_MAX {
+		return 0, syserror.E2BIG
+	}
+	if uint64(listSize) > requestedSize {
+		return 0, syserror.ERANGE
+	}
+
+	// Don't copy out the attributes if size is 0.
+	if size == 0 {
+		return listSize, nil
+	}
+
+	buf := make([]byte, 0, listSize)
+	for x := range xattrs {
+		buf = append(buf, []byte(x)...)
+		buf = append(buf, 0)
+	}
+	if _, err := t.CopyOutBytes(addr, buf); err != nil {
+		return 0, err
+	}
+
+	return len(buf), nil
+}
+
+func xattrListSize(xattrs map[string]struct{}) int {
+	size := 0
+	for x := range xattrs {
+		size += len(x) + 1
+	}
+	return size
+}
+
+// RemoveXattr implements linux syscall removexattr(2).
+func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return removeXattrFromPath(t, args, true)
+}
+
+// LRemoveXattr implements linux syscall lremovexattr(2).
+func LRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return removeXattrFromPath(t, args, false)
+}
+
+// FRemoveXattr implements linux syscall fremovexattr(2).
+func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	return 0, nil, removeXattr(t, f.Dirent, nameAddr)
+}
+
+func removeXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+
+		return removeXattr(t, d, nameAddr)
+	})
+}
+
+// removeXattr implements removexattr(2) from the given *fs.Dirent.
+func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+		return err
+	}
+
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+
+	return d.Inode.RemoveXattr(t, d, name)
+}
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 4d84ad999..cadd83273 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -768,12 +768,22 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 }
 
 // TODO(b/127675828): support getxattr.
-func (l *localFile) GetXattr(name string, size uint64) (string, error) {
+func (*localFile) GetXattr(string, uint64) (string, error) {
 	return "", syscall.EOPNOTSUPP
 }
 
 // TODO(b/127675828): support setxattr.
-func (l *localFile) SetXattr(name, value string, flags uint32) error {
+func (*localFile) SetXattr(string, string, uint32) error {
+	return syscall.EOPNOTSUPP
+}
+
+// TODO(b/148303075): support listxattr.
+func (*localFile) ListXattr(uint64) (map[string]struct{}, error) {
+	return nil, syscall.EOPNOTSUPP
+}
+
+// TODO(b/148303075): support removexattr.
+func (*localFile) RemoveXattr(string) error {
 	return syscall.EOPNOTSUPP
 }
 
@@ -790,7 +800,7 @@ func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error
 }
 
 // Rename implements p9.File; this should never be called.
-func (l *localFile) Rename(p9.File, string) error {
+func (*localFile) Rename(p9.File, string) error {
 	panic("rename called directly")
 }
 
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 12d389c3e..ca1af209a 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3782,6 +3782,7 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         gtest,
         "//test/util:posix_error",
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index 85eb31847..8b00ef44c 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -24,6 +24,7 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
+#include "absl/container/flat_hash_set.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
@@ -38,36 +39,36 @@ namespace {
 
 class XattrTest : public FileTest {};
 
-TEST_F(XattrTest, XattrNullName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+TEST_F(XattrTest, XattrNonexistentFile) {
+  const char* path = "/does/not/exist";
+  EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
+              SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(listxattr(path, nullptr, 0), SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(ENOENT));
+}
 
+TEST_F(XattrTest, XattrNullName) {
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EFAULT));
   EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
               SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(EFAULT));
 }
 
 TEST_F(XattrTest, XattrEmptyName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(ERANGE));
   EXPECT_THAT(getxattr(path, "", nullptr, 0), SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(removexattr(path, ""), SyscallFailsWithErrno(ERANGE));
 }
 
 TEST_F(XattrTest, XattrLargeName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   std::string name = "user.";
   name += std::string(XATTR_NAME_MAX - name.length(), 'a');
@@ -86,28 +87,23 @@ TEST_F(XattrTest, XattrLargeName) {
               SyscallFailsWithErrno(ERANGE));
   EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
               SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(removexattr(path, name.c_str()), SyscallFailsWithErrno(ERANGE));
 }
 
 TEST_F(XattrTest, XattrInvalidPrefix) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   std::string name(XATTR_NAME_MAX, 'a');
   EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EOPNOTSUPP));
   EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
               SyscallFailsWithErrno(EOPNOTSUPP));
+  EXPECT_THAT(removexattr(path, name.c_str()),
+              SyscallFailsWithErrno(EOPNOTSUPP));
 }
 
 // Do not allow save/restore cycles after making the test file read-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -124,19 +120,21 @@ TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
 
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
               SyscallFailsWithErrno(EACCES));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EACCES));
 
   char buf = '-';
   EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
   EXPECT_EQ(buf, val);
+
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
 }
 
 // Do not allow save/restore cycles after making the test file write-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -152,6 +150,14 @@ TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
 
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
+
+  // listxattr will succeed even without read permissions.
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(removexattr(path, name), SyscallSucceeds());
 }
 
 TEST_F(XattrTest, XattrTrustedWithNonadmin) {
@@ -163,64 +169,66 @@ TEST_F(XattrTest, XattrTrustedWithNonadmin) {
   const char name[] = "trusted.abc";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, XattrOnDirectory) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const char name[] = "user.test";
-  EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(dir.path().c_str(), name, nullptr, 0, /*flags=*/0),
               SyscallSucceeds());
-  EXPECT_THAT(getxattr(dir.path().c_str(), name, NULL, 0),
+  EXPECT_THAT(getxattr(dir.path().c_str(), name, nullptr, 0),
               SyscallSucceedsWithValue(0));
+
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(dir.path().c_str(), list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(removexattr(dir.path().c_str(), name), SyscallSucceeds());
 }
 
 TEST_F(XattrTest, XattrOnSymlink) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
   const char name[] = "user.test";
-  EXPECT_THAT(setxattr(link.path().c_str(), name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(link.path().c_str(), name, nullptr, 0, /*flags=*/0),
               SyscallSucceeds());
-  EXPECT_THAT(getxattr(link.path().c_str(), name, NULL, 0),
+  EXPECT_THAT(getxattr(link.path().c_str(), name, nullptr, 0),
               SyscallSucceedsWithValue(0));
+
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(link.path().c_str(), list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(removexattr(link.path().c_str(), name), SyscallSucceeds());
 }
 
 TEST_F(XattrTest, XattrOnInvalidFileTypes) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char name[] = "user.test";
 
   char char_device[] = "/dev/zero";
-  EXPECT_THAT(setxattr(char_device, name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(char_device, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EPERM));
-  EXPECT_THAT(getxattr(char_device, name, NULL, 0),
+  EXPECT_THAT(getxattr(char_device, name, nullptr, 0),
               SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(listxattr(char_device, nullptr, 0), SyscallSucceedsWithValue(0));
 
   // Use tmpfs, where creation of named pipes is supported.
   const std::string fifo = NewTempAbsPathInDir("/dev/shm");
   const char* path = fifo.c_str();
   EXPECT_THAT(mknod(path, S_IFIFO | S_IRUSR | S_IWUSR, 0), SyscallSucceeds());
-  EXPECT_THAT(setxattr(path, name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EPERM));
-  EXPECT_THAT(getxattr(path, name, NULL, 0), SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(listxattr(path, nullptr, 0), SyscallSucceedsWithValue(0));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
 }
 
 TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -236,10 +244,6 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, SetxattrZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -252,10 +256,6 @@ TEST_F(XattrTest, SetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrSizeTooLarge) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
 
@@ -271,10 +271,6 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
@@ -284,10 +280,6 @@ TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -296,10 +288,6 @@ TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val(XATTR_SIZE_MAX + 1);
@@ -316,10 +304,6 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -335,10 +319,6 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithLarger) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -353,10 +333,6 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) {
 }
 
 TEST_F(XattrTest, SetxattrCreateFlag) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
@@ -368,10 +344,6 @@ TEST_F(XattrTest, SetxattrCreateFlag) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceFlag) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
@@ -384,10 +356,6 @@ TEST_F(XattrTest, SetxattrReplaceFlag) {
 }
 
 TEST_F(XattrTest, SetxattrInvalidFlags) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   int invalid_flags = 0xff;
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
@@ -395,10 +363,6 @@ TEST_F(XattrTest, SetxattrInvalidFlags) {
 }
 
 TEST_F(XattrTest, Getxattr) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -411,10 +375,6 @@ TEST_F(XattrTest, Getxattr) {
 }
 
 TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -427,10 +387,6 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -446,10 +402,6 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -463,10 +415,6 @@ TEST_F(XattrTest, GetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrSizeTooLarge) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -483,10 +431,6 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, GetxattrNullValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -498,10 +442,6 @@ TEST_F(XattrTest, GetxattrNullValue) {
 }
 
 TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -518,35 +458,109 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrNonexistentName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, Listxattr) {
+  const char* path = test_file_name_.c_str();
+  const std::string name = "user.test";
+  const std::string name2 = "user.test2";
+  const std::string name3 = "user.test3";
+  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name2.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name3.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
 
+  std::vector<char> list(name.size() + 1 + name2.size() + 1 + name3.size() + 1);
+  char* buf = list.data();
+  EXPECT_THAT(listxattr(path, buf, XATTR_SIZE_MAX),
+              SyscallSucceedsWithValue(list.size()));
+
+  absl::flat_hash_set<std::string> got = {};
+  for (char* p = buf; p < buf + list.size(); p += strlen(p) + 1) {
+    got.insert(std::string{p});
+  }
+
+  absl::flat_hash_set<std::string> expected = {name, name2, name3};
+  EXPECT_EQ(got, expected);
+}
+
+TEST_F(XattrTest, ListxattrNoXattrs) {
+  const char* path = test_file_name_.c_str();
+
+  std::vector<char> list, expected;
+  EXPECT_THAT(listxattr(path, list.data(), sizeof(list)),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(list, expected);
+
+  // Listxattr should succeed if there are no attributes, even if the buffer
+  // passed in is a nullptr.
+  EXPECT_THAT(listxattr(path, nullptr, sizeof(list)),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, ListxattrNullBuffer) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+  EXPECT_THAT(listxattr(path, nullptr, sizeof(name)),
+              SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, ListxattrSizeTooSmall) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+  char list[sizeof(name) - 1];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, ListxattrZeroSize) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+  EXPECT_THAT(listxattr(path, nullptr, 0),
+              SyscallSucceedsWithValue(sizeof(name)));
+}
+
+TEST_F(XattrTest, RemoveXattr) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+  EXPECT_THAT(removexattr(path, name), SyscallSucceeds());
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
-TEST_F(XattrTest, LGetSetxattrOnSymlink) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+TEST_F(XattrTest, RemoveXattrNonexistentName) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(ENODATA));
+}
 
+TEST_F(XattrTest, LXattrOnSymlink) {
+  const char name[] = "user.test";
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
 
-  EXPECT_THAT(lsetxattr(link.path().c_str(), nullptr, nullptr, 0, 0),
+  EXPECT_THAT(lsetxattr(link.path().c_str(), name, nullptr, 0, 0),
               SyscallFailsWithErrno(EPERM));
-  EXPECT_THAT(lgetxattr(link.path().c_str(), nullptr, nullptr, 0),
+  EXPECT_THAT(lgetxattr(link.path().c_str(), name, nullptr, 0),
               SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(llistxattr(link.path().c_str(), nullptr, 0),
+              SyscallSucceedsWithValue(0));
+  EXPECT_THAT(lremovexattr(link.path().c_str(), name),
+              SyscallFailsWithErrno(EPERM));
 }
 
-TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
+TEST_F(XattrTest, LXattrOnNonsymlink) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -558,13 +572,16 @@ TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
   EXPECT_THAT(lgetxattr(path, name, &buf, size),
               SyscallSucceedsWithValue(size));
   EXPECT_EQ(buf, val);
-}
 
-TEST_F(XattrTest, FGetSetxattr) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+  char list[sizeof(name)];
+  EXPECT_THAT(llistxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
 
+  EXPECT_THAT(lremovexattr(path, name), SyscallSucceeds());
+}
+
+TEST_F(XattrTest, XattrWithFD) {
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0));
   const char name[] = "user.test";
@@ -577,6 +594,13 @@ TEST_F(XattrTest, FGetSetxattr) {
   EXPECT_THAT(fgetxattr(fd.get(), name, &buf, size),
               SyscallSucceedsWithValue(size));
   EXPECT_EQ(buf, val);
+
+  char list[sizeof(name)];
+  EXPECT_THAT(flistxattr(fd.get(), list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(fremovexattr(fd.get(), name), SyscallSucceeds());
 }
 
 }  // namespace
-- 
cgit v1.2.3


From 75412ed9f5b6b327dec05ffff99d7fe6198d25a8 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Mon, 10 Feb 2020 10:28:56 -0800
Subject: Internal change.

PiperOrigin-RevId: 294250370
---
 test/syscalls/linux/inotify.cc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index fdef646eb..0e13ad190 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -1055,9 +1055,9 @@ TEST(Inotify, ChmodGeneratesAttribEvent_NoRandomSave) {
   const TempPath file1 =
       ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
 
-  const FileDescriptor root_fd =
+  FileDescriptor root_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(root.path(), O_RDONLY));
-  const FileDescriptor file1_fd =
+  FileDescriptor file1_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR));
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
 
@@ -1091,6 +1091,11 @@ TEST(Inotify, ChmodGeneratesAttribEvent_NoRandomSave) {
   ASSERT_THAT(fchmodat(root_fd.get(), file1_basename.c_str(), S_IWGRP, 0),
               SyscallSucceeds());
   verify_chmod_events();
+
+  // Make sure the chmod'ed file descriptors are destroyed before DisableSave
+  // is destructed.
+  root_fd.reset();
+  file1_fd.reset();
 }
 
 TEST(Inotify, TruncateGeneratesModifyEvent) {
-- 
cgit v1.2.3


From 46a36b64d5164d1ac887aa528d23bb2f2c74489e Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 12 Feb 2020 06:35:20 -0800
Subject: Include more test files in exports_files

So that they can be included by Fuchsia's syscall tests

PiperOrigin-RevId: 294654890
---
 test/syscalls/linux/BUILD | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index ca1af209a..e7c82adfc 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -10,13 +10,16 @@ exports_files(
         "socket.cc",
         "socket_inet_loopback.cc",
         "socket_ip_loopback_blocking.cc",
+        "socket_ip_tcp_generic_loopback.cc",
         "socket_ip_tcp_loopback.cc",
+        "socket_ip_tcp_udp_generic.cc",
         "socket_ip_udp_loopback.cc",
         "socket_ip_unbound.cc",
         "socket_ipv4_tcp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
         "tcp_socket.cc",
+        "udp_bind.cc",
         "udp_socket.cc",
     ],
     visibility = ["//:sandbox"],
-- 
cgit v1.2.3


From 69bf39e8a47d3b4dcbbd04d2e8df476cdfab5e74 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 13 Feb 2020 10:58:47 -0800
Subject: Internal change.

PiperOrigin-RevId: 294952610
---
 pkg/abi/linux/socket.go                        | 13 ++++
 pkg/sentry/socket/control/BUILD                |  1 +
 pkg/sentry/socket/control/control.go           | 43 +++++++++++++
 pkg/sentry/socket/hostinet/socket.go           | 11 +++-
 pkg/sentry/socket/netstack/netstack.go         | 37 ++++++++++--
 pkg/tcpip/tcpip.go                             | 25 ++++++++
 pkg/tcpip/transport/udp/endpoint.go            | 26 ++++++++
 test/syscalls/linux/socket_ip_udp_generic.cc   | 44 ++++++++++++++
 test/syscalls/linux/socket_ipv4_udp_unbound.cc | 84 ++++++++++++++++++++++++++
 test/syscalls/linux/udp_socket_test_cases.cc   |  1 -
 10 files changed, 278 insertions(+), 7 deletions(-)

(limited to 'test/syscalls')

diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go
index 766ee4014..4a14ef691 100644
--- a/pkg/abi/linux/socket.go
+++ b/pkg/abi/linux/socket.go
@@ -411,6 +411,15 @@ type ControlMessageCredentials struct {
 	GID uint32
 }
 
+// A ControlMessageIPPacketInfo is IP_PKTINFO socket control message.
+//
+// ControlMessageIPPacketInfo represents struct in_pktinfo from linux/in.h.
+type ControlMessageIPPacketInfo struct {
+	NIC             int32
+	LocalAddr       InetAddr
+	DestinationAddr InetAddr
+}
+
 // SizeOfControlMessageCredentials is the binary size of a
 // ControlMessageCredentials struct.
 var SizeOfControlMessageCredentials = int(binary.Size(ControlMessageCredentials{}))
@@ -431,6 +440,10 @@ const SizeOfControlMessageTOS = 1
 // SizeOfControlMessageTClass is the size of an IPV6_TCLASS control message.
 const SizeOfControlMessageTClass = 4
 
+// SizeOfControlMessageIPPacketInfo is the size of an IP_PKTINFO
+// control message.
+const SizeOfControlMessageIPPacketInfo = 12
+
 // SCM_MAX_FD is the maximum number of FDs accepted in a single sendmsg call.
 // From net/scm.h.
 const SCM_MAX_FD = 253
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 79e16d6e8..4d42d29cb 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -19,6 +19,7 @@ go_library(
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/syserror",
+        "//pkg/tcpip",
         "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 6145a7fc3..4667373d2 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -338,6 +339,22 @@ func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
 	)
 }
 
+// PackIPPacketInfo packs an IP_PKTINFO socket control message.
+func PackIPPacketInfo(t *kernel.Task, packetInfo tcpip.IPPacketInfo, buf []byte) []byte {
+	var p linux.ControlMessageIPPacketInfo
+	p.NIC = int32(packetInfo.NIC)
+	copy(p.LocalAddr[:], []byte(packetInfo.LocalAddr))
+	copy(p.DestinationAddr[:], []byte(packetInfo.DestinationAddr))
+
+	return putCmsgStruct(
+		buf,
+		linux.SOL_IP,
+		linux.IP_PKTINFO,
+		t.Arch().Width(),
+		p,
+	)
+}
+
 // PackControlMessages packs control messages into the given buffer.
 //
 // We skip control messages specific to Unix domain sockets.
@@ -362,6 +379,10 @@ func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byt
 		buf = PackTClass(t, cmsgs.IP.TClass, buf)
 	}
 
+	if cmsgs.IP.HasIPPacketInfo {
+		buf = PackIPPacketInfo(t, cmsgs.IP.PacketInfo, buf)
+	}
+
 	return buf
 }
 
@@ -394,6 +415,16 @@ func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int {
 	return space
 }
 
+// NewIPPacketInfo returns the IPPacketInfo struct.
+func NewIPPacketInfo(packetInfo linux.ControlMessageIPPacketInfo) tcpip.IPPacketInfo {
+	var p tcpip.IPPacketInfo
+	p.NIC = tcpip.NICID(packetInfo.NIC)
+	copy([]byte(p.LocalAddr), packetInfo.LocalAddr[:])
+	copy([]byte(p.DestinationAddr), packetInfo.DestinationAddr[:])
+
+	return p
+}
+
 // Parse parses a raw socket control message into portable objects.
 func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.ControlMessages, error) {
 	var (
@@ -468,6 +499,18 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS)
 				i += binary.AlignUp(length, width)
 
+			case linux.IP_PKTINFO:
+				if length < linux.SizeOfControlMessageIPPacketInfo {
+					return socket.ControlMessages{}, syserror.EINVAL
+				}
+
+				cmsgs.IP.HasIPPacketInfo = true
+				var packetInfo linux.ControlMessageIPPacketInfo
+				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo)
+
+				cmsgs.IP.PacketInfo = NewIPPacketInfo(packetInfo)
+				i += binary.AlignUp(length, width)
+
 			default:
 				return socket.ControlMessages{}, syserror.EINVAL
 			}
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index de76388ac..22f78d2e2 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -289,7 +289,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 	switch level {
 	case linux.SOL_IP:
 		switch name {
-		case linux.IP_TOS, linux.IP_RECVTOS:
+		case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO:
 			optlen = sizeofInt32
 		}
 	case linux.SOL_IPV6:
@@ -336,6 +336,8 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 		switch name {
 		case linux.IP_TOS, linux.IP_RECVTOS:
 			optlen = sizeofInt32
+		case linux.IP_PKTINFO:
+			optlen = linux.SizeOfControlMessageIPPacketInfo
 		}
 	case linux.SOL_IPV6:
 		switch name {
@@ -473,7 +475,14 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 			case syscall.IP_TOS:
 				controlMessages.IP.HasTOS = true
 				binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTOS], usermem.ByteOrder, &controlMessages.IP.TOS)
+
+			case syscall.IP_PKTINFO:
+				controlMessages.IP.HasIPPacketInfo = true
+				var packetInfo linux.ControlMessageIPPacketInfo
+				binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo)
+				controlMessages.IP.PacketInfo = control.NewIPPacketInfo(packetInfo)
 			}
+
 		case syscall.SOL_IPV6:
 			switch unixCmsg.Header.Type {
 			case syscall.IPV6_TCLASS:
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index ed2fbcceb..9757fbfba 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1414,6 +1414,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		}
 		return o, nil
 
+	case linux.IP_PKTINFO:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveIPPacketInfoOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+		var o int32
+		if v {
+			o = 1
+		}
+		return o, nil
+
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1762,6 +1777,7 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		linux.IPV6_IPSEC_POLICY,
 		linux.IPV6_JOIN_ANYCAST,
 		linux.IPV6_LEAVE_ANYCAST,
+		// TODO(b/148887420): Add support for IPV6_PKTINFO.
 		linux.IPV6_PKTINFO,
 		linux.IPV6_ROUTER_ALERT,
 		linux.IPV6_XFRM_POLICY,
@@ -1949,6 +1965,16 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0))
 
+	case linux.IP_PKTINFO:
+		if len(optVal) == 0 {
+			return nil
+		}
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0))
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
@@ -1964,7 +1990,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		linux.IP_NODEFRAG,
 		linux.IP_OPTIONS,
 		linux.IP_PASSSEC,
-		linux.IP_PKTINFO,
 		linux.IP_RECVERR,
 		linux.IP_RECVFRAGSIZE,
 		linux.IP_RECVOPTS,
@@ -2395,10 +2420,12 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 func (s *SocketOperations) controlMessages() socket.ControlMessages {
 	return socket.ControlMessages{
 		IP: tcpip.ControlMessages{
-			HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp,
-			Timestamp:    s.readCM.Timestamp,
-			HasTOS:       s.readCM.HasTOS,
-			TOS:          s.readCM.TOS,
+			HasTimestamp:    s.readCM.HasTimestamp && s.sockOptTimestamp,
+			Timestamp:       s.readCM.Timestamp,
+			HasTOS:          s.readCM.HasTOS,
+			TOS:             s.readCM.TOS,
+			HasIPPacketInfo: s.readCM.HasIPPacketInfo,
+			PacketInfo:      s.readCM.PacketInfo,
 		},
 	}
 }
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 0e944712f..9ca39ce40 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -328,6 +328,12 @@ type ControlMessages struct {
 
 	// Tclass is the IPv6 traffic class of the associated packet.
 	TClass int32
+
+	// HasIPPacketInfo indicates whether PacketInfo is set.
+	HasIPPacketInfo bool
+
+	// PacketInfo holds interface and address data on an incoming packet.
+	PacketInfo IPPacketInfo
 }
 
 // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
@@ -503,6 +509,11 @@ const (
 	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
 	// socket is to be restricted to sending and receiving IPv6 packets only.
 	V6OnlyOption
+
+	// ReceiveIPPacketInfoOption is used by {G,S}etSockOptBool to specify
+	// if more inforamtion is provided with incoming packets such
+	// as interface index and address.
+	ReceiveIPPacketInfoOption
 )
 
 // SockOptInt represents socket options which values have the int type.
@@ -685,6 +696,20 @@ type IPv4TOSOption uint8
 // for all subsequent outgoing IPv6 packets from the endpoint.
 type IPv6TrafficClassOption uint8
 
+// IPPacketInfo is the message struture for IP_PKTINFO.
+//
+// +stateify savable
+type IPPacketInfo struct {
+	// NIC is the ID of the NIC to be used.
+	NIC NICID
+
+	// LocalAddr is the local address.
+	LocalAddr Address
+
+	// DestinationAddr is the destination address.
+	DestinationAddr Address
+}
+
 // Route is a row in the routing table. It specifies through which NIC (and
 // gateway) sets of packets should be routed. A row is considered viable if the
 // masked target address matches the destination address in the row.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index c9cbed8f4..3fe91cac2 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -29,6 +29,7 @@ import (
 type udpPacket struct {
 	udpPacketEntry
 	senderAddress tcpip.FullAddress
+	packetInfo    tcpip.IPPacketInfo
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
 	tos           uint8
@@ -118,6 +119,9 @@ type endpoint struct {
 	// as ancillary data to ControlMessages on Read.
 	receiveTOS bool
 
+	// receiveIPPacketInfo determines if the packet info is returned by Read.
+	receiveIPPacketInfo bool
+
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -254,11 +258,17 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 	}
 	e.mu.RLock()
 	receiveTOS := e.receiveTOS
+	receiveIPPacketInfo := e.receiveIPPacketInfo
 	e.mu.RUnlock()
 	if receiveTOS {
 		cm.HasTOS = true
 		cm.TOS = p.tos
 	}
+
+	if receiveIPPacketInfo {
+		cm.HasIPPacketInfo = true
+		cm.PacketInfo = p.packetInfo
+	}
 	return p.data.ToView(), cm, nil
 }
 
@@ -495,6 +505,13 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		}
 
 		e.v6only = v
+		return nil
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.Lock()
+		e.receiveIPPacketInfo = v
+		e.mu.Unlock()
+		return nil
 	}
 
 	return nil
@@ -703,6 +720,12 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		e.mu.RUnlock()
 
 		return v, nil
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.RLock()
+		v := e.receiveIPPacketInfo
+		e.mu.RUnlock()
+		return v, nil
 	}
 
 	return false, tcpip.ErrUnknownProtocolOption
@@ -1247,6 +1270,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	switch r.NetProto {
 	case header.IPv4ProtocolNumber:
 		packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
+		packet.packetInfo.LocalAddr = r.LocalAddress
+		packet.packetInfo.DestinationAddr = r.RemoteAddress
+		packet.packetInfo.NIC = r.NICID()
 	}
 
 	packet.timestamp = e.stack.NowNanoseconds()
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 53290bed7..db5663ecd 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -357,5 +357,49 @@ TEST_P(UDPSocketPairTest, SetReuseAddrReusePort) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
+// Test getsockopt for a socket which is not set with IP_PKTINFO option.
+TEST_P(UDPSocketPairTest, IPPKTINFODefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_IP, IP_PKTINFO, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test setsockopt and getsockopt for a socket with IP_PKTINFO option.
+TEST_P(UDPSocketPairTest, SetAndGetIPPKTINFO) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int level = SOL_IP;
+  int type = IP_PKTINFO;
+
+  // Check getsockopt before IP_PKTINFO is set.
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get, kSockOptOn);
+  EXPECT_EQ(get_len, sizeof(get));
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOff,
+                         sizeof(kSockOptOff)),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get, kSockOptOff);
+  EXPECT_EQ(get_len, sizeof(get));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index 990ccf23c..bc4b07a62 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -15,6 +15,7 @@
 #include "test/syscalls/linux/socket_ipv4_udp_unbound.h"
 
 #include <arpa/inet.h>
+#include <net/if.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -2128,5 +2129,88 @@ TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
               SyscallSucceedsWithValue(kMessageSize));
 }
 
+// Test that socket will receive packet info control message.
+TEST_P(IPv4UDPUnboundSocketTest, SetAndReceiveIPPKTINFO) {
+  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+  SKIP_IF((IsRunningWithHostinet()));
+
+  auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto sender_addr = V4Loopback();
+  int level = SOL_IP;
+  int type = IP_PKTINFO;
+
+  ASSERT_THAT(
+      bind(receiver->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+           sender_addr.addr_len),
+      SyscallSucceeds());
+  socklen_t sender_addr_len = sender_addr.addr_len;
+  ASSERT_THAT(getsockname(receiver->get(),
+                          reinterpret_cast<sockaddr*>(&sender_addr.addr),
+                          &sender_addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(sender_addr_len, sender_addr.addr_len);
+
+  auto receiver_addr = V4Loopback();
+  reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port =
+      reinterpret_cast<sockaddr_in*>(&sender_addr.addr)->sin_port;
+  ASSERT_THAT(
+      connect(sender->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+              receiver_addr.addr_len),
+      SyscallSucceeds());
+
+  // Allow socket to receive control message.
+  ASSERT_THAT(
+      setsockopt(receiver->get(), level, type, &kSockOptOn, sizeof(kSockOptOn)),
+      SyscallSucceeds());
+
+  // Prepare message to send.
+  constexpr size_t kDataLength = 1024;
+  msghdr sent_msg = {};
+  iovec sent_iov = {};
+  char sent_data[kDataLength];
+  sent_iov.iov_base = sent_data;
+  sent_iov.iov_len = kDataLength;
+  sent_msg.msg_iov = &sent_iov;
+  sent_msg.msg_iovlen = 1;
+  sent_msg.msg_flags = 0;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(sender->get(), &sent_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  msghdr received_msg = {};
+  iovec received_iov = {};
+  char received_data[kDataLength];
+  char received_cmsg_buf[CMSG_SPACE(sizeof(in_pktinfo))] = {};
+  size_t cmsg_data_len = sizeof(in_pktinfo);
+  received_iov.iov_base = received_data;
+  received_iov.iov_len = kDataLength;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+  received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+  received_msg.msg_control = received_cmsg_buf;
+
+  ASSERT_THAT(RetryEINTR(recvmsg)(receiver->get(), &received_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+  ASSERT_NE(cmsg, nullptr);
+  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+  EXPECT_EQ(cmsg->cmsg_level, level);
+  EXPECT_EQ(cmsg->cmsg_type, type);
+
+  // Get loopback index.
+  ifreq ifr = {};
+  absl::SNPrintF(ifr.ifr_name, IFNAMSIZ, "lo");
+  ASSERT_THAT(ioctl(sender->get(), SIOCGIFINDEX, &ifr), SyscallSucceeds());
+  ASSERT_NE(ifr.ifr_ifindex, 0);
+
+  // Check the data
+  in_pktinfo received_pktinfo = {};
+  memcpy(&received_pktinfo, CMSG_DATA(cmsg), sizeof(in_pktinfo));
+  EXPECT_EQ(received_pktinfo.ipi_ifindex, ifr.ifr_ifindex);
+  EXPECT_EQ(received_pktinfo.ipi_spec_dst.s_addr, htonl(INADDR_LOOPBACK));
+  EXPECT_EQ(received_pktinfo.ipi_addr.s_addr, htonl(INADDR_LOOPBACK));
+}
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index a2f6ef8cc..9f8de6b48 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1495,6 +1495,5 @@ TEST_P(UdpSocketTest, SendAndReceiveTOS) {
   memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
   EXPECT_EQ(received_tos, sent_tos);
 }
-
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From c841373013ec8659b2954563796479f275b00bfa Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 12:00:58 -0800
Subject: Deflake fallocate syscall test.

- Retry if fallocate returns EINTR.

- If fallocate fails, don't try to fstat and confirm the result.

PiperOrigin-RevId: 295789790
---
 test/syscalls/linux/fallocate.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc
index 1c3d00287..7819f4ac3 100644
--- a/test/syscalls/linux/fallocate.cc
+++ b/test/syscalls/linux/fallocate.cc
@@ -33,7 +33,7 @@ namespace testing {
 namespace {
 
 int fallocate(int fd, int mode, off_t offset, off_t len) {
-  return syscall(__NR_fallocate, fd, mode, offset, len);
+  return RetryEINTR(syscall)(__NR_fallocate, fd, mode, offset, len);
 }
 
 class AllocateTest : public FileTest {
@@ -47,27 +47,27 @@ TEST_F(AllocateTest, Fallocate) {
   EXPECT_EQ(buf.st_size, 0);
 
   // Grow to ten bytes.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 10), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 10), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 10);
 
   // Allocate to a smaller size should be noop.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 5), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 5), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 10);
 
   // Grow again.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 20), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 20), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 20);
 
   // Grow with offset.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 10, 20), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 10, 20), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 30);
 
   // Grow with offset beyond EOF.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 39, 1), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 39, 1), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 40);
 }
-- 
cgit v1.2.3


From 56fd9504aab44a738d3df164cbee8e572b309f28 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 15:44:22 -0800
Subject: Enable IPV6_RECVTCLASS socket option for datagram sockets

Added the ability to get/set the IP_RECVTCLASS socket option on UDP endpoints.
If enabled, traffic class from the incoming Network Header passed as ancillary
data in the ControlMessages.

Adding Get/SetSockOptBool to decrease the overhead of getting/setting simple
options. (This was absorbed in a CL that will be landing before this one).

Test:
* Added unit test to udp_test.go that tests getting/setting as well as
verifying that we receive expected TOS from incoming packet.
* Added a syscall test for verifying getting/setting
* Removed test skip for existing syscall test to enable end to end test.
PiperOrigin-RevId: 295840218
---
 pkg/sentry/socket/control/control.go         |   2 +-
 pkg/sentry/socket/netstack/netstack.go       |  27 +++++-
 pkg/tcpip/checker/checker.go                 |  14 +++
 pkg/tcpip/tcpip.go                           |  15 ++-
 pkg/tcpip/transport/udp/endpoint.go          |  38 +++++++-
 pkg/tcpip/transport/udp/udp_test.go          | 120 ++++++++++++++----------
 test/syscalls/linux/ip_socket_test_util.h    |  16 ++--
 test/syscalls/linux/socket_ip_udp_generic.cc | 133 +++++++++++++++++++--------
 test/syscalls/linux/udp_socket_test_cases.cc |   4 -
 9 files changed, 260 insertions(+), 109 deletions(-)

(limited to 'test/syscalls')

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 4667373d2..8834a1e1a 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -329,7 +329,7 @@ func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
 }
 
 // PackTClass packs an IPV6_TCLASS socket control message.
-func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
+func PackTClass(t *kernel.Task, tClass uint32, buf []byte) []byte {
 	return putCmsgStruct(
 		buf,
 		linux.SOL_IPV6,
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 9757fbfba..e187276c5 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1318,6 +1318,22 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		}
 		return ib, nil
 
+	case linux.IPV6_RECVTCLASS:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveTClassOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		var o int32
+		if v {
+			o = 1
+		}
+		return o, nil
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
@@ -1803,6 +1819,14 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv6TrafficClassOption(v)))
 
+	case linux.IPV6_RECVTCLASS:
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0))
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
@@ -2086,7 +2110,6 @@ func emitUnimplementedEventIPv6(t *kernel.Task, name int) {
 		linux.IPV6_RECVPATHMTU,
 		linux.IPV6_RECVPKTINFO,
 		linux.IPV6_RECVRTHDR,
-		linux.IPV6_RECVTCLASS,
 		linux.IPV6_RTHDR,
 		linux.IPV6_RTHDRDSTOPTS,
 		linux.IPV6_TCLASS,
@@ -2424,6 +2447,8 @@ func (s *SocketOperations) controlMessages() socket.ControlMessages {
 			Timestamp:       s.readCM.Timestamp,
 			HasTOS:          s.readCM.HasTOS,
 			TOS:             s.readCM.TOS,
+			HasTClass:       s.readCM.HasTClass,
+			TClass:          s.readCM.TClass,
 			HasIPPacketInfo: s.readCM.HasIPPacketInfo,
 			PacketInfo:      s.readCM.PacketInfo,
 		},
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 4d6ae0871..c6c160dfc 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -161,6 +161,20 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	}
 }
 
+// ReceiveTClass creates a checker that checks the TCLASS field in
+// ControlMessages.
+func ReceiveTClass(want uint32) ControlMessagesChecker {
+	return func(t *testing.T, cm tcpip.ControlMessages) {
+		t.Helper()
+		if !cm.HasTClass {
+			t.Fatalf("got cm.HasTClass = %t, want cm.TClass = %d", cm.HasTClass, want)
+		}
+		if got := cm.TClass; got != want {
+			t.Fatalf("got cm.TClass = %d, want %d", got, want)
+		}
+	}
+}
+
 // ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
 func ReceiveTOS(want uint8) ControlMessagesChecker {
 	return func(t *testing.T, cm tcpip.ControlMessages) {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 9ca39ce40..ce5527391 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -323,11 +323,11 @@ type ControlMessages struct {
 	// TOS is the IPv4 type of service of the associated packet.
 	TOS uint8
 
-	// HasTClass indicates whether Tclass is valid/set.
+	// HasTClass indicates whether TClass is valid/set.
 	HasTClass bool
 
-	// Tclass is the IPv6 traffic class of the associated packet.
-	TClass int32
+	// TClass is the IPv6 traffic class of the associated packet.
+	TClass uint32
 
 	// HasIPPacketInfo indicates whether PacketInfo is set.
 	HasIPPacketInfo bool
@@ -502,9 +502,13 @@ type WriteOptions struct {
 type SockOptBool int
 
 const (
+	// ReceiveTClassOption is used by SetSockOpt/GetSockOpt to specify if the
+	// IPV6_TCLASS ancillary message is passed with incoming packets.
+	ReceiveTClassOption SockOptBool = iota
+
 	// ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
 	// ancillary message is passed with incoming packets.
-	ReceiveTOSOption SockOptBool = iota
+	ReceiveTOSOption
 
 	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
 	// socket is to be restricted to sending and receiving IPv6 packets only.
@@ -514,6 +518,9 @@ const (
 	// if more inforamtion is provided with incoming packets such
 	// as interface index and address.
 	ReceiveIPPacketInfoOption
+
+	// TODO(b/146901447): convert existing bool socket options to be handled via
+	// Get/SetSockOptBool
 )
 
 // SockOptInt represents socket options which values have the int type.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 3fe91cac2..eff7f3600 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -32,7 +32,8 @@ type udpPacket struct {
 	packetInfo    tcpip.IPPacketInfo
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
-	tos           uint8
+	// tos stores either the receiveTOS or receiveTClass value.
+	tos uint8
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -119,6 +120,10 @@ type endpoint struct {
 	// as ancillary data to ControlMessages on Read.
 	receiveTOS bool
 
+	// receiveTClass determines if the incoming IPv6 TClass header field is
+	// passed as ancillary data to ControlMessages on Read.
+	receiveTClass bool
+
 	// receiveIPPacketInfo determines if the packet info is returned by Read.
 	receiveIPPacketInfo bool
 
@@ -258,13 +263,18 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 	}
 	e.mu.RLock()
 	receiveTOS := e.receiveTOS
+	receiveTClass := e.receiveTClass
 	receiveIPPacketInfo := e.receiveIPPacketInfo
 	e.mu.RUnlock()
 	if receiveTOS {
 		cm.HasTOS = true
 		cm.TOS = p.tos
 	}
-
+	if receiveTClass {
+		cm.HasTClass = true
+		// Although TClass is an 8-bit value it's read in the CMsg as a uint32.
+		cm.TClass = uint32(p.tos)
+	}
 	if receiveIPPacketInfo {
 		cm.HasIPPacketInfo = true
 		cm.PacketInfo = p.packetInfo
@@ -490,6 +500,17 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return tcpip.ErrNotSupported
+		}
+
+		e.mu.Lock()
+		e.receiveTClass = v
+		e.mu.Unlock()
+		return nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -709,6 +730,17 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		e.mu.RUnlock()
 		return v, nil
 
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return false, tcpip.ErrNotSupported
+		}
+
+		e.mu.RLock()
+		v := e.receiveTClass
+		e.mu.RUnlock()
+		return v, nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -1273,6 +1305,8 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		packet.packetInfo.LocalAddr = r.LocalAddress
 		packet.packetInfo.DestinationAddr = r.RemoteAddress
 		packet.packetInfo.NIC = r.NICID()
+	case header.IPv6ProtocolNumber:
+		packet.tos, _ = header.IPv6(pkt.NetworkHeader).TOS()
 	}
 
 	packet.timestamp = e.stack.NowNanoseconds()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index f0ff3fe71..34b7c2360 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -409,6 +409,7 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 	// Initialize the IP header.
 	ip := header.IPv6(buf)
 	ip.Encode(&header.IPv6Fields{
+		TrafficClass:  testTOS,
 		PayloadLength: uint16(header.UDPMinimumSize + len(payload)),
 		NextHeader:    uint8(udp.ProtocolNumber),
 		HopLimit:      65,
@@ -1336,7 +1337,7 @@ func TestSetTTL(t *testing.T) {
 	}
 }
 
-func TestTOSV4(t *testing.T) {
+func TestSetTOS(t *testing.T) {
 	for _, flow := range []testFlow{unicastV4, multicastV4, broadcast} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
 			c := newDualTestContext(t, defaultMTU)
@@ -1347,23 +1348,23 @@ func TestTOSV4(t *testing.T) {
 			const tos = testTOS
 			var v tcpip.IPv4TOSOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, 0)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, 0)
 			}
 
 			if err := c.ep.SetSockOpt(tcpip.IPv4TOSOption(tos)); err != nil {
-				c.t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.IPv4TOSOption(tos), err)
+				c.t.Errorf("SetSockOpt(%T, 0x%x) failed: %s", v, tcpip.IPv4TOSOption(tos), err)
 			}
 
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 
 			if want := tcpip.IPv4TOSOption(tos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, want)
 			}
 
 			testWrite(c, flow, checker.TOS(tos, 0))
@@ -1371,7 +1372,7 @@ func TestTOSV4(t *testing.T) {
 	}
 }
 
-func TestTOSV6(t *testing.T) {
+func TestSetTClass(t *testing.T) {
 	for _, flow := range []testFlow{unicastV4in6, unicastV6, unicastV6Only, multicastV4in6, multicastV6, broadcastIn6} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
 			c := newDualTestContext(t, defaultMTU)
@@ -1379,71 +1380,92 @@ func TestTOSV6(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = testTOS
+			const tClass = testTOS
 			var v tcpip.IPv6TrafficClassOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, 0)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, 0)
 			}
 
-			if err := c.ep.SetSockOpt(tcpip.IPv6TrafficClassOption(tos)); err != nil {
-				c.t.Errorf("SetSockOpt failed: %s", err)
+			if err := c.ep.SetSockOpt(tcpip.IPv6TrafficClassOption(tClass)); err != nil {
+				c.t.Errorf("SetSockOpt(%T, 0x%x) failed: %s", v, tcpip.IPv6TrafficClassOption(tClass), err)
 			}
 
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 
-			if want := tcpip.IPv6TrafficClassOption(tos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+			if want := tcpip.IPv6TrafficClassOption(tClass); v != want {
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, want)
 			}
 
-			testWrite(c, flow, checker.TOS(tos, 0))
+			// The header getter for TClass is called TOS, so use that checker.
+			testWrite(c, flow, checker.TOS(tClass, 0))
 		})
 	}
 }
 
-func TestReceiveTOSV4(t *testing.T) {
-	for _, flow := range []testFlow{unicastV4, broadcast} {
-		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
-			c := newDualTestContext(t, defaultMTU)
-			defer c.cleanup()
+func TestReceiveTosTClass(t *testing.T) {
+	testCases := []struct {
+		name             string
+		getReceiveOption tcpip.SockOptBool
+		tests            []testFlow
+	}{
+		{"ReceiveTosOption", tcpip.ReceiveTOSOption, []testFlow{unicastV4, broadcast}},
+		{"ReceiveTClassOption", tcpip.ReceiveTClassOption, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}},
+	}
+	for _, testCase := range testCases {
+		for _, flow := range testCase.tests {
+			t.Run(fmt.Sprintf("%s:flow:%s", testCase.name, flow), func(t *testing.T) {
+				c := newDualTestContext(t, defaultMTU)
+				defer c.cleanup()
 
-			c.createEndpointForFlow(flow)
+				c.createEndpointForFlow(flow)
+				option := testCase.getReceiveOption
+				name := testCase.name
 
-			// Verify that setting and reading the option works.
-			v, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
-			if err != nil {
-				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
-			}
-			// Test for expected default value.
-			if v != false {
-				c.t.Errorf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", v, false)
-			}
+				// Verify that setting and reading the option works.
+				v, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockoptBool(%s) failed: %s", name, err)
+				}
+				// Test for expected default value.
+				if v != false {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, v, false)
+				}
 
-			want := true
-			if err := c.ep.SetSockOptBool(tcpip.ReceiveTOSOption, want); err != nil {
-				c.t.Fatalf("SetSockOptBool(tcpip.ReceiveTOSOption, %t) failed: %s", want, err)
-			}
+				want := true
+				if err := c.ep.SetSockOptBool(option, want); err != nil {
+					c.t.Fatalf("SetSockOptBool(%s, %t) failed: %s", name, want, err)
+				}
 
-			got, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
-			if err != nil {
-				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
-			}
-			if got != want {
-				c.t.Fatalf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", got, want)
-			}
+				got, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockoptBool(%s) failed: %s", name, err)
+				}
 
-			// Verify that the correct received TOS is handed through as
-			// ancillary data to the ControlMessages struct.
-			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-				c.t.Fatal("Bind failed:", err)
-			}
-			testRead(c, flow, checker.ReceiveTOS(testTOS))
-		})
+				if got != want {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, got, want)
+				}
+
+				// Verify that the correct received TOS or TClass is handed through as
+				// ancillary data to the ControlMessages struct.
+				if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+					c.t.Fatalf("Bind failed: %s", err)
+				}
+				switch option {
+				case tcpip.ReceiveTClassOption:
+					testRead(c, flow, checker.ReceiveTClass(testTOS))
+				case tcpip.ReceiveTOSOption:
+					testRead(c, flow, checker.ReceiveTOS(testTOS))
+				default:
+					t.Fatalf("unknown test variant: %s", name)
+				}
+			})
+		}
 	}
 }
 
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 083ebbcf0..39fd6709d 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -84,20 +84,20 @@ SocketPairKind DualStackUDPBidirectionalBindSocketPair(int type);
 // SocketPairs created with AF_INET and the given type.
 SocketPairKind IPv4UDPUnboundSocketPair(int type);
 
-// IPv4UDPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET, SOCK_DGRAM, and the given type.
+// IPv4UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_DGRAM, and the given type.
 SocketKind IPv4UDPUnboundSocket(int type);
 
-// IPv6UDPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET6, SOCK_DGRAM, and the given type.
+// IPv6UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_DGRAM, and the given type.
 SocketKind IPv6UDPUnboundSocket(int type);
 
-// IPv4TCPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET, SOCK_STREAM and the given type.
+// IPv4TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_STREAM and the given type.
 SocketKind IPv4TCPUnboundSocket(int type);
 
-// IPv6TCPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET6, SOCK_STREAM and the given type.
+// IPv6TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_STREAM and the given type.
 SocketKind IPv6TCPUnboundSocket(int type);
 
 // IfAddrHelper is a helper class that determines the local interfaces present
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index db5663ecd..1c533fdf2 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -14,6 +14,7 @@
 
 #include "test/syscalls/linux/socket_ip_udp_generic.h"
 
+#include <errno.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <poll.h>
@@ -209,46 +210,6 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
-// Ensure that Receiving TOS is off by default.
-TEST_P(UDPSocketPairTest, RecvTosDefault) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-}
-
-// Test that setting and getting IP_RECVTOS works as expected.
-TEST_P(UDPSocketPairTest, SetRecvTos) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOff, sizeof(kSockOptOff)),
-              SyscallSucceeds());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOn, sizeof(kSockOptOn)),
-              SyscallSucceeds());
-
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOn);
-}
-
 TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -401,5 +362,97 @@ TEST_P(UDPSocketPairTest, SetAndGetIPPKTINFO) {
   EXPECT_EQ(get_len, sizeof(get));
 }
 
+// Holds TOS or TClass information for IPv4 or IPv6 respectively.
+struct RecvTosOption {
+  int level;
+  int option;
+};
+
+RecvTosOption GetRecvTosOption(int domain) {
+  TEST_CHECK(domain == AF_INET || domain == AF_INET6);
+  RecvTosOption opt;
+  switch (domain) {
+    case AF_INET:
+      opt.level = IPPROTO_IP;
+      opt.option = IP_RECVTOS;
+      break;
+    case AF_INET6:
+      opt.level = IPPROTO_IPV6;
+      opt.option = IPV6_RECVTCLASS;
+      break;
+  }
+  return opt;
+}
+
+// Ensure that Receiving TOS or TCLASS is off by default.
+TEST_P(UDPSocketPairTest, RecvTosDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(GetParam().domain);
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that setting and getting IP_RECVTOS or IPV6_RECVTCLASS works as
+// expected.
+TEST_P(UDPSocketPairTest, SetRecvTos) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(GetParam().domain);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOff,
+                         sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+}
+
+// Test that any socket (including IPv6 only) accepts the IPv4 TOS option: this
+// mirrors behavior in linux.
+TEST_P(UDPSocketPairTest, TOSRecvMismatch) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(AF_INET);
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+}
+
+// Test that an IPv4 socket does not support the IPv6 TClass option.
+TEST_P(UDPSocketPairTest, TClassRecvMismatch) {
+  // This should only test AF_INET sockets for the mismatch behavior.
+  SKIP_IF(GetParam().domain != AF_INET);
+
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IPV6, IPV6_RECVTCLASS,
+                         &get, &get_len),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 9f8de6b48..57b1a357c 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,9 +1349,6 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
-  SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
-          !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1422,7 +1419,6 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
   // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From 55c99ce106e03c419729318947e0be477ed181d0 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 19 Feb 2020 12:31:43 -0800
Subject: Include more test files in exports_files

So that they can be included by Fuchsia's syscall tests

PiperOrigin-RevId: 296030383
---
 test/syscalls/linux/BUILD | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index e7c82adfc..05a818795 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -12,8 +12,12 @@ exports_files(
         "socket_ip_loopback_blocking.cc",
         "socket_ip_tcp_generic_loopback.cc",
         "socket_ip_tcp_loopback.cc",
+        "socket_ip_tcp_loopback_blocking.cc",
+        "socket_ip_tcp_loopback_nonblock.cc",
         "socket_ip_tcp_udp_generic.cc",
         "socket_ip_udp_loopback.cc",
+        "socket_ip_udp_loopback_blocking.cc",
+        "socket_ip_udp_loopback_nonblock.cc",
         "socket_ip_unbound.cc",
         "socket_ipv4_tcp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_external_networking_test.cc",
-- 
cgit v1.2.3


From 30794512d3977ebb2b185e5e9cfb969d558a07a4 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 19 Feb 2020 18:20:52 -0800
Subject: Add basic microbenchmarks.

PiperOrigin-RevId: 296104390
---
 WORKSPACE                                  |  10 +
 test/perf/BUILD                            | 114 +++++++
 test/perf/linux/BUILD                      | 356 +++++++++++++++++++++
 test/perf/linux/clock_getres_benchmark.cc  |  39 +++
 test/perf/linux/clock_gettime_benchmark.cc |  60 ++++
 test/perf/linux/death_benchmark.cc         |  36 +++
 test/perf/linux/epoll_benchmark.cc         |  99 ++++++
 test/perf/linux/fork_benchmark.cc          | 350 +++++++++++++++++++++
 test/perf/linux/futex_benchmark.cc         | 248 +++++++++++++++
 test/perf/linux/getdents_benchmark.cc      | 149 +++++++++
 test/perf/linux/getpid_benchmark.cc        |  37 +++
 test/perf/linux/gettid_benchmark.cc        |  38 +++
 test/perf/linux/mapping_benchmark.cc       | 163 ++++++++++
 test/perf/linux/open_benchmark.cc          |  56 ++++
 test/perf/linux/pipe_benchmark.cc          |  66 ++++
 test/perf/linux/randread_benchmark.cc      | 100 ++++++
 test/perf/linux/read_benchmark.cc          |  53 ++++
 test/perf/linux/sched_yield_benchmark.cc   |  37 +++
 test/perf/linux/send_recv_benchmark.cc     | 372 ++++++++++++++++++++++
 test/perf/linux/seqwrite_benchmark.cc      |  66 ++++
 test/perf/linux/signal_benchmark.cc        |  59 ++++
 test/perf/linux/sleep_benchmark.cc         |  60 ++++
 test/perf/linux/stat_benchmark.cc          |  62 ++++
 test/perf/linux/unlink_benchmark.cc        |  66 ++++
 test/perf/linux/write_benchmark.cc         |  52 ++++
 test/runner/BUILD                          |  22 ++
 test/runner/defs.bzl                       | 218 +++++++++++++
 test/runner/gtest/BUILD                    |   9 +
 test/runner/gtest/gtest.go                 | 154 +++++++++
 test/runner/runner.go                      | 477 ++++++++++++++++++++++++++++
 test/syscalls/BUILD                        |  21 +-
 test/syscalls/build_defs.bzl               | 180 -----------
 test/syscalls/gtest/BUILD                  |   9 -
 test/syscalls/gtest/gtest.go               |  93 ------
 test/syscalls/linux/alarm.cc               |   3 +-
 test/syscalls/linux/exec.cc                |   3 +-
 test/syscalls/linux/fcntl.cc               |   2 +-
 test/syscalls/linux/itimer.cc              |   3 +-
 test/syscalls/linux/prctl.cc               |   2 +-
 test/syscalls/linux/prctl_setuid.cc        |   2 +-
 test/syscalls/linux/proc.cc                |   2 +-
 test/syscalls/linux/ptrace.cc              |   2 +-
 test/syscalls/linux/rtsignal.cc            |   3 +-
 test/syscalls/linux/seccomp.cc             |   2 +-
 test/syscalls/linux/sigiret.cc             |   3 +-
 test/syscalls/linux/signalfd.cc            |   2 +-
 test/syscalls/linux/sigstop.cc             |   2 +-
 test/syscalls/linux/sigtimedwait.cc        |   3 +-
 test/syscalls/linux/timers.cc              |   2 +-
 test/syscalls/linux/vfork.cc               |   2 +-
 test/syscalls/syscall_test_runner.go       | 482 -----------------------------
 test/syscalls/syscall_test_runner.sh       |  34 --
 test/util/BUILD                            |   3 +-
 test/util/test_main.cc                     |   2 +-
 test/util/test_util.h                      |   1 +
 test/util/test_util_impl.cc                |  14 +
 tools/bazeldefs/defs.bzl                   |   1 +
 tools/defs.bzl                             |   3 +-
 58 files changed, 3666 insertions(+), 843 deletions(-)
 create mode 100644 test/perf/BUILD
 create mode 100644 test/perf/linux/BUILD
 create mode 100644 test/perf/linux/clock_getres_benchmark.cc
 create mode 100644 test/perf/linux/clock_gettime_benchmark.cc
 create mode 100644 test/perf/linux/death_benchmark.cc
 create mode 100644 test/perf/linux/epoll_benchmark.cc
 create mode 100644 test/perf/linux/fork_benchmark.cc
 create mode 100644 test/perf/linux/futex_benchmark.cc
 create mode 100644 test/perf/linux/getdents_benchmark.cc
 create mode 100644 test/perf/linux/getpid_benchmark.cc
 create mode 100644 test/perf/linux/gettid_benchmark.cc
 create mode 100644 test/perf/linux/mapping_benchmark.cc
 create mode 100644 test/perf/linux/open_benchmark.cc
 create mode 100644 test/perf/linux/pipe_benchmark.cc
 create mode 100644 test/perf/linux/randread_benchmark.cc
 create mode 100644 test/perf/linux/read_benchmark.cc
 create mode 100644 test/perf/linux/sched_yield_benchmark.cc
 create mode 100644 test/perf/linux/send_recv_benchmark.cc
 create mode 100644 test/perf/linux/seqwrite_benchmark.cc
 create mode 100644 test/perf/linux/signal_benchmark.cc
 create mode 100644 test/perf/linux/sleep_benchmark.cc
 create mode 100644 test/perf/linux/stat_benchmark.cc
 create mode 100644 test/perf/linux/unlink_benchmark.cc
 create mode 100644 test/perf/linux/write_benchmark.cc
 create mode 100644 test/runner/BUILD
 create mode 100644 test/runner/defs.bzl
 create mode 100644 test/runner/gtest/BUILD
 create mode 100644 test/runner/gtest/gtest.go
 create mode 100644 test/runner/runner.go
 delete mode 100644 test/syscalls/build_defs.bzl
 delete mode 100644 test/syscalls/gtest/BUILD
 delete mode 100644 test/syscalls/gtest/gtest.go
 delete mode 100644 test/syscalls/syscall_test_runner.go
 delete mode 100755 test/syscalls/syscall_test_runner.sh

(limited to 'test/syscalls')

diff --git a/WORKSPACE b/WORKSPACE
index 2827c3a26..ff0196dc6 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -330,3 +330,13 @@ http_archive(
         "https://github.com/google/googletest/archive/565f1b848215b77c3732bca345fe76a0431d8b34.tar.gz",
     ],
 )
+
+http_archive(
+    name = "com_google_benchmark",
+    sha256 = "3c6a165b6ecc948967a1ead710d4a181d7b0fbcaa183ef7ea84604994966221a",
+    strip_prefix = "benchmark-1.5.0",
+    urls = [
+        "https://mirror.bazel.build/github.com/google/benchmark/archive/v1.5.0.tar.gz",
+        "https://github.com/google/benchmark/archive/v1.5.0.tar.gz",
+    ],
+)
diff --git a/test/perf/BUILD b/test/perf/BUILD
new file mode 100644
index 000000000..7a2bf10ed
--- /dev/null
+++ b/test/perf/BUILD
@@ -0,0 +1,114 @@
+load("//test/runner:defs.bzl", "syscall_test")
+
+package(licenses = ["notice"])
+
+syscall_test(
+    test = "//test/perf/linux:clock_getres_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:clock_gettime_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:death_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:epoll_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:fork_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:futex_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:getdents_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:getpid_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:gettid_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:mapping_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:open_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:pipe_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:randread_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:read_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:sched_yield_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:send_recv_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:seqwrite_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:signal_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:sleep_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:stat_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:unlink_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:write_benchmark",
+)
diff --git a/test/perf/linux/BUILD b/test/perf/linux/BUILD
new file mode 100644
index 000000000..b4e907826
--- /dev/null
+++ b/test/perf/linux/BUILD
@@ -0,0 +1,356 @@
+load("//tools:defs.bzl", "cc_binary", "gbenchmark", "gtest")
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+cc_binary(
+    name = "getpid_benchmark",
+    testonly = 1,
+    srcs = [
+        "getpid_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "send_recv_benchmark",
+    testonly = 1,
+    srcs = [
+        "send_recv_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/syscalls/linux:socket_test_util",
+        "//test/util:file_descriptor",
+        "//test/util:logging",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_binary(
+    name = "gettid_benchmark",
+    testonly = 1,
+    srcs = [
+        "gettid_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "sched_yield_benchmark",
+    testonly = 1,
+    srcs = [
+        "sched_yield_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "clock_getres_benchmark",
+    testonly = 1,
+    srcs = [
+        "clock_getres_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "clock_gettime_benchmark",
+    testonly = 1,
+    srcs = [
+        "clock_gettime_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+        "@com_google_absl//absl/time",
+    ],
+)
+
+cc_binary(
+    name = "open_benchmark",
+    testonly = 1,
+    srcs = [
+        "open_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "read_benchmark",
+    testonly = 1,
+    srcs = [
+        "read_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "randread_benchmark",
+    testonly = 1,
+    srcs = [
+        "randread_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:file_descriptor",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/random",
+    ],
+)
+
+cc_binary(
+    name = "write_benchmark",
+    testonly = 1,
+    srcs = [
+        "write_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "seqwrite_benchmark",
+    testonly = 1,
+    srcs = [
+        "seqwrite_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/random",
+    ],
+)
+
+cc_binary(
+    name = "pipe_benchmark",
+    testonly = 1,
+    srcs = [
+        "pipe_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+    ],
+)
+
+cc_binary(
+    name = "fork_benchmark",
+    testonly = 1,
+    srcs = [
+        "fork_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:cleanup",
+        "//test/util:file_descriptor",
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_binary(
+    name = "futex_benchmark",
+    testonly = 1,
+    srcs = [
+        "futex_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/time",
+    ],
+)
+
+cc_binary(
+    name = "epoll_benchmark",
+    testonly = 1,
+    srcs = [
+        "epoll_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:epoll_util",
+        "//test/util:file_descriptor",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/time",
+    ],
+)
+
+cc_binary(
+    name = "death_benchmark",
+    testonly = 1,
+    srcs = [
+        "death_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "mapping_benchmark",
+    testonly = 1,
+    srcs = [
+        "mapping_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:memory_util",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "signal_benchmark",
+    testonly = 1,
+    srcs = [
+        "signal_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "getdents_benchmark",
+    testonly = 1,
+    srcs = [
+        "getdents_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "sleep_benchmark",
+    testonly = 1,
+    srcs = [
+        "sleep_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "stat_benchmark",
+    testonly = 1,
+    srcs = [
+        "stat_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_binary(
+    name = "unlink_benchmark",
+    testonly = 1,
+    srcs = [
+        "unlink_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
diff --git a/test/perf/linux/clock_getres_benchmark.cc b/test/perf/linux/clock_getres_benchmark.cc
new file mode 100644
index 000000000..b051293ad
--- /dev/null
+++ b/test/perf/linux/clock_getres_benchmark.cc
@@ -0,0 +1,39 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <time.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// clock_getres(1) is very nearly a no-op syscall, but it does require copying
+// out to a userspace struct. It thus provides a nice small copy-out benchmark.
+void BM_ClockGetRes(benchmark::State& state) {
+  struct timespec ts;
+  for (auto _ : state) {
+    clock_getres(CLOCK_MONOTONIC, &ts);
+  }
+}
+
+BENCHMARK(BM_ClockGetRes);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/clock_gettime_benchmark.cc b/test/perf/linux/clock_gettime_benchmark.cc
new file mode 100644
index 000000000..6691bebd9
--- /dev/null
+++ b/test/perf/linux/clock_gettime_benchmark.cc
@@ -0,0 +1,60 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <pthread.h>
+#include <time.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_ClockGettimeThreadCPUTime(benchmark::State& state) {
+  clockid_t clockid;
+  ASSERT_EQ(0, pthread_getcpuclockid(pthread_self(), &clockid));
+  struct timespec tp;
+
+  for (auto _ : state) {
+    clock_gettime(clockid, &tp);
+  }
+}
+
+BENCHMARK(BM_ClockGettimeThreadCPUTime);
+
+void BM_VDSOClockGettime(benchmark::State& state) {
+  const clockid_t clock = state.range(0);
+  struct timespec tp;
+  absl::Time start = absl::Now();
+
+  // Don't benchmark the calibration phase.
+  while (absl::Now() < start + absl::Milliseconds(2100)) {
+    clock_gettime(clock, &tp);
+  }
+
+  for (auto _ : state) {
+    clock_gettime(clock, &tp);
+  }
+}
+
+BENCHMARK(BM_VDSOClockGettime)->Arg(CLOCK_MONOTONIC)->Arg(CLOCK_REALTIME);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/death_benchmark.cc b/test/perf/linux/death_benchmark.cc
new file mode 100644
index 000000000..cb2b6fd07
--- /dev/null
+++ b/test/perf/linux/death_benchmark.cc
@@ -0,0 +1,36 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// DeathTest is not so much a microbenchmark as a macrobenchmark. It is testing
+// the ability of gVisor (on whatever platform) to execute all the related
+// stack-dumping routines associated with EXPECT_EXIT / EXPECT_DEATH.
+TEST(DeathTest, ZeroEqualsOne) {
+  EXPECT_EXIT({ TEST_CHECK(0 == 1); }, ::testing::KilledBySignal(SIGABRT), "");
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/epoll_benchmark.cc b/test/perf/linux/epoll_benchmark.cc
new file mode 100644
index 000000000..0b121338a
--- /dev/null
+++ b/test/perf/linux/epoll_benchmark.cc
@@ -0,0 +1,99 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+
+#include <atomic>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <ctime>
+#include <memory>
+
+#include "gtest/gtest.h"
+#include "absl/time/time.h"
+#include "benchmark/benchmark.h"
+#include "test/util/epoll_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Returns a new eventfd.
+PosixErrorOr<FileDescriptor> NewEventFD() {
+  int fd = eventfd(0, /* flags = */ 0);
+  MaybeSave();
+  if (fd < 0) {
+    return PosixError(errno, "eventfd");
+  }
+  return FileDescriptor(fd);
+}
+
+// Also stolen from epoll.cc unit tests.
+void BM_EpollTimeout(benchmark::State& state) {
+  constexpr int kFDsPerEpoll = 3;
+  auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+
+  std::vector<FileDescriptor> eventfds;
+  for (int i = 0; i < kFDsPerEpoll; i++) {
+    eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+    ASSERT_NO_ERRNO(
+        RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN, 0));
+  }
+
+  struct epoll_event result[kFDsPerEpoll];
+  int timeout_ms = state.range(0);
+
+  for (auto _ : state) {
+    EXPECT_EQ(0, epoll_wait(epollfd.get(), result, kFDsPerEpoll, timeout_ms));
+  }
+}
+
+BENCHMARK(BM_EpollTimeout)->Range(0, 8);
+
+// Also stolen from epoll.cc unit tests.
+void BM_EpollAllEvents(benchmark::State& state) {
+  auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+  const int fds_per_epoll = state.range(0);
+  constexpr uint64_t kEventVal = 5;
+
+  std::vector<FileDescriptor> eventfds;
+  for (int i = 0; i < fds_per_epoll; i++) {
+    eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+    ASSERT_NO_ERRNO(
+        RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN, 0));
+
+    ASSERT_THAT(WriteFd(eventfds[i].get(), &kEventVal, sizeof(kEventVal)),
+                SyscallSucceedsWithValue(sizeof(kEventVal)));
+  }
+
+  std::vector<struct epoll_event> result(fds_per_epoll);
+
+  for (auto _ : state) {
+    EXPECT_EQ(fds_per_epoll,
+              epoll_wait(epollfd.get(), result.data(), fds_per_epoll, 0));
+  }
+}
+
+BENCHMARK(BM_EpollAllEvents)->Range(2, 1024);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/fork_benchmark.cc b/test/perf/linux/fork_benchmark.cc
new file mode 100644
index 000000000..84fdbc8a0
--- /dev/null
+++ b/test/perf/linux/fork_benchmark.cc
@@ -0,0 +1,350 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/synchronization/barrier.h"
+#include "benchmark/benchmark.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int kBusyMax = 250;
+
+// Do some CPU-bound busy-work.
+int busy(int max) {
+  // Prevent the compiler from optimizing this work away,
+  volatile int count = 0;
+
+  for (int i = 1; i < max; i++) {
+    for (int j = 2; j < i / 2; j++) {
+      if (i % j == 0) {
+        count++;
+      }
+    }
+  }
+
+  return count;
+}
+
+void BM_CPUBoundUniprocess(benchmark::State& state) {
+  for (auto _ : state) {
+    busy(kBusyMax);
+  }
+}
+
+BENCHMARK(BM_CPUBoundUniprocess);
+
+void BM_CPUBoundAsymmetric(benchmark::State& state) {
+  const size_t max = state.max_iterations;
+  pid_t child = fork();
+  if (child == 0) {
+    for (int i = 0; i < max; i++) {
+      busy(kBusyMax);
+    }
+    _exit(0);
+  }
+  ASSERT_THAT(child, SyscallSucceeds());
+  ASSERT_TRUE(state.KeepRunningBatch(max));
+
+  int status;
+  EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+  EXPECT_TRUE(WIFEXITED(status));
+  EXPECT_EQ(0, WEXITSTATUS(status));
+  ASSERT_FALSE(state.KeepRunning());
+}
+
+BENCHMARK(BM_CPUBoundAsymmetric)->UseRealTime();
+
+void BM_CPUBoundSymmetric(benchmark::State& state) {
+  std::vector<pid_t> children;
+  auto child_cleanup = Cleanup([&] {
+    for (const pid_t child : children) {
+      int status;
+      EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+      EXPECT_TRUE(WIFEXITED(status));
+      EXPECT_EQ(0, WEXITSTATUS(status));
+    }
+    ASSERT_FALSE(state.KeepRunning());
+  });
+
+  const int processes = state.range(0);
+  for (int i = 0; i < processes; i++) {
+    size_t cur = (state.max_iterations + (processes - 1)) / processes;
+    if ((state.iterations() + cur) >= state.max_iterations) {
+      cur = state.max_iterations - state.iterations();
+    }
+    pid_t child = fork();
+    if (child == 0) {
+      for (int i = 0; i < cur; i++) {
+        busy(kBusyMax);
+      }
+      _exit(0);
+    }
+    ASSERT_THAT(child, SyscallSucceeds());
+    if (cur > 0) {
+      // We can have a zero cur here, depending.
+      ASSERT_TRUE(state.KeepRunningBatch(cur));
+    }
+    children.push_back(child);
+  }
+}
+
+BENCHMARK(BM_CPUBoundSymmetric)->Range(2, 16)->UseRealTime();
+
+// Child routine for ProcessSwitch/ThreadSwitch.
+// Reads from readfd and writes the result to writefd.
+void SwitchChild(int readfd, int writefd) {
+  while (1) {
+    char buf;
+    int ret = ReadFd(readfd, &buf, 1);
+    if (ret == 0) {
+      break;
+    }
+    TEST_CHECK_MSG(ret == 1, "read failed");
+
+    ret = WriteFd(writefd, &buf, 1);
+    if (ret == -1) {
+      TEST_CHECK_MSG(errno == EPIPE, "unexpected write failure");
+      break;
+    }
+    TEST_CHECK_MSG(ret == 1, "write failed");
+  }
+}
+
+// Send bytes in a loop through a series of pipes, each passing through a
+// different process.
+//
+//  Proc 0        Proc 1
+//    * ----------> *
+//    ^   Pipe 1    |
+//    |             |
+//    | Pipe 0      | Pipe 2
+//    |             |
+//    |             |
+//    |   Pipe 3    v
+//    * <---------- *
+//  Proc 3        Proc 2
+//
+// This exercises context switching through multiple processes.
+void BM_ProcessSwitch(benchmark::State& state) {
+  // Code below assumes there are at least two processes.
+  const int num_processes = state.range(0);
+  ASSERT_GE(num_processes, 2);
+
+  std::vector<pid_t> children;
+  auto child_cleanup = Cleanup([&] {
+    for (const pid_t child : children) {
+      int status;
+      EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+      EXPECT_TRUE(WIFEXITED(status));
+      EXPECT_EQ(0, WEXITSTATUS(status));
+    }
+  });
+
+  // Must come after children, as the FDs must be closed before the children
+  // will exit.
+  std::vector<FileDescriptor> read_fds;
+  std::vector<FileDescriptor> write_fds;
+
+  for (int i = 0; i < num_processes; i++) {
+    int fds[2];
+    ASSERT_THAT(pipe(fds), SyscallSucceeds());
+    read_fds.emplace_back(fds[0]);
+    write_fds.emplace_back(fds[1]);
+  }
+
+  // This process is one of the processes in the loop. It will be considered
+  // index 0.
+  for (int i = 1; i < num_processes; i++) {
+    // Read from current pipe index, write to next.
+    const int read_index = i;
+    const int read_fd = read_fds[read_index].get();
+
+    const int write_index = (i + 1) % num_processes;
+    const int write_fd = write_fds[write_index].get();
+
+    // std::vector isn't safe to use from the fork child.
+    FileDescriptor* read_array = read_fds.data();
+    FileDescriptor* write_array = write_fds.data();
+
+    pid_t child = fork();
+    if (!child) {
+      // Close all other FDs.
+      for (int j = 0; j < num_processes; j++) {
+        if (j != read_index) {
+          read_array[j].reset();
+        }
+        if (j != write_index) {
+          write_array[j].reset();
+        }
+      }
+
+      SwitchChild(read_fd, write_fd);
+      _exit(0);
+    }
+    ASSERT_THAT(child, SyscallSucceeds());
+    children.push_back(child);
+  }
+
+  // Read from current pipe index (0), write to next (1).
+  const int read_index = 0;
+  const int read_fd = read_fds[read_index].get();
+
+  const int write_index = 1;
+  const int write_fd = write_fds[write_index].get();
+
+  // Kick start the loop.
+  char buf = 'a';
+  ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+
+  for (auto _ : state) {
+    ASSERT_THAT(ReadFd(read_fd, &buf, 1), SyscallSucceedsWithValue(1));
+    ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+  }
+}
+
+BENCHMARK(BM_ProcessSwitch)->Range(2, 16)->UseRealTime();
+
+// Equivalent to BM_ThreadSwitch using threads instead of processes.
+void BM_ThreadSwitch(benchmark::State& state) {
+  // Code below assumes there are at least two threads.
+  const int num_threads = state.range(0);
+  ASSERT_GE(num_threads, 2);
+
+  // Must come after threads, as the FDs must be closed before the children
+  // will exit.
+  std::vector<std::unique_ptr<ScopedThread>> threads;
+  std::vector<FileDescriptor> read_fds;
+  std::vector<FileDescriptor> write_fds;
+
+  for (int i = 0; i < num_threads; i++) {
+    int fds[2];
+    ASSERT_THAT(pipe(fds), SyscallSucceeds());
+    read_fds.emplace_back(fds[0]);
+    write_fds.emplace_back(fds[1]);
+  }
+
+  // This thread is one of the threads in the loop. It will be considered
+  // index 0.
+  for (int i = 1; i < num_threads; i++) {
+    // Read from current pipe index, write to next.
+    //
+    // Transfer ownership of the FDs to the thread.
+    const int read_index = i;
+    const int read_fd = read_fds[read_index].release();
+
+    const int write_index = (i + 1) % num_threads;
+    const int write_fd = write_fds[write_index].release();
+
+    threads.emplace_back(std::make_unique<ScopedThread>([read_fd, write_fd] {
+      FileDescriptor read(read_fd);
+      FileDescriptor write(write_fd);
+      SwitchChild(read.get(), write.get());
+    }));
+  }
+
+  // Read from current pipe index (0), write to next (1).
+  const int read_index = 0;
+  const int read_fd = read_fds[read_index].get();
+
+  const int write_index = 1;
+  const int write_fd = write_fds[write_index].get();
+
+  // Kick start the loop.
+  char buf = 'a';
+  ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+
+  for (auto _ : state) {
+    ASSERT_THAT(ReadFd(read_fd, &buf, 1), SyscallSucceedsWithValue(1));
+    ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+  }
+
+  // The two FDs still owned by this thread are closed, causing the next thread
+  // to exit its loop and close its FDs, and so on until all threads exit.
+}
+
+BENCHMARK(BM_ThreadSwitch)->Range(2, 16)->UseRealTime();
+
+void BM_ThreadStart(benchmark::State& state) {
+  const int num_threads = state.range(0);
+
+  for (auto _ : state) {
+    state.PauseTiming();
+
+    auto barrier = new absl::Barrier(num_threads + 1);
+    std::vector<std::unique_ptr<ScopedThread>> threads;
+
+    state.ResumeTiming();
+
+    for (size_t i = 0; i < num_threads; ++i) {
+      threads.emplace_back(std::make_unique<ScopedThread>([barrier] {
+        if (barrier->Block()) {
+          delete barrier;
+        }
+      }));
+    }
+
+    if (barrier->Block()) {
+      delete barrier;
+    }
+
+    state.PauseTiming();
+
+    for (const auto& thread : threads) {
+      thread->Join();
+    }
+
+    state.ResumeTiming();
+  }
+}
+
+BENCHMARK(BM_ThreadStart)->Range(1, 2048)->UseRealTime();
+
+// Benchmark the complete fork + exit + wait.
+void BM_ProcessLifecycle(benchmark::State& state) {
+  const int num_procs = state.range(0);
+
+  std::vector<pid_t> pids(num_procs);
+  for (auto _ : state) {
+    for (size_t i = 0; i < num_procs; ++i) {
+      int pid = fork();
+      if (pid == 0) {
+        _exit(0);
+      }
+      ASSERT_THAT(pid, SyscallSucceeds());
+      pids[i] = pid;
+    }
+
+    for (const int pid : pids) {
+      ASSERT_THAT(RetryEINTR(waitpid)(pid, nullptr, 0),
+                  SyscallSucceedsWithValue(pid));
+    }
+  }
+}
+
+BENCHMARK(BM_ProcessLifecycle)->Range(1, 512)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/futex_benchmark.cc b/test/perf/linux/futex_benchmark.cc
new file mode 100644
index 000000000..b349d50bf
--- /dev/null
+++ b/test/perf/linux/futex_benchmark.cc
@@ -0,0 +1,248 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/futex.h>
+
+#include <atomic>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <ctime>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+inline int FutexWait(std::atomic<int32_t>* v, int32_t val) {
+  return syscall(SYS_futex, v, FUTEX_BITSET_MATCH_ANY, nullptr);
+}
+
+inline int FutexWaitRelativeTimeout(std::atomic<int32_t>* v, int32_t val,
+                                    const struct timespec* reltime) {
+  return syscall(SYS_futex, v, FUTEX_WAIT_PRIVATE, reltime);
+}
+
+inline int FutexWaitAbsoluteTimeout(std::atomic<int32_t>* v, int32_t val,
+                                    const struct timespec* abstime) {
+  return syscall(SYS_futex, v, FUTEX_BITSET_MATCH_ANY, abstime);
+}
+
+inline int FutexWaitBitsetAbsoluteTimeout(std::atomic<int32_t>* v, int32_t val,
+                                          int32_t bits,
+                                          const struct timespec* abstime) {
+  return syscall(SYS_futex, v, FUTEX_WAIT_BITSET_PRIVATE | FUTEX_CLOCK_REALTIME,
+                 val, abstime, nullptr, bits);
+}
+
+inline int FutexWake(std::atomic<int32_t>* v, int32_t count) {
+  return syscall(SYS_futex, v, FUTEX_WAKE_PRIVATE, count);
+}
+
+// This just uses FUTEX_WAKE on an address with nothing waiting, very simple.
+void BM_FutexWakeNop(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+
+  for (auto _ : state) {
+    EXPECT_EQ(0, FutexWake(&v, 1));
+  }
+}
+
+BENCHMARK(BM_FutexWakeNop);
+
+// This just uses FUTEX_WAIT on an address whose value has changed, i.e., the
+// syscall won't wait.
+void BM_FutexWaitNop(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+
+  for (auto _ : state) {
+    EXPECT_EQ(-EAGAIN, FutexWait(&v, 1));
+  }
+}
+
+BENCHMARK(BM_FutexWaitNop);
+
+// This uses FUTEX_WAIT with a timeout on an address whose value never
+// changes, such that it always times out. Timeout overhead can be estimated by
+// timer overruns for short timeouts.
+void BM_FutexWaitTimeout(benchmark::State& state) {
+  const int timeout_ns = state.range(0);
+  std::atomic<int32_t> v(0);
+  auto ts = absl::ToTimespec(absl::Nanoseconds(timeout_ns));
+
+  for (auto _ : state) {
+    EXPECT_EQ(-ETIMEDOUT, FutexWaitRelativeTimeout(&v, 0, &ts));
+  }
+}
+
+BENCHMARK(BM_FutexWaitTimeout)
+    ->Arg(1)
+    ->Arg(10)
+    ->Arg(100)
+    ->Arg(1000)
+    ->Arg(10000);
+
+// This calls FUTEX_WAIT_BITSET with CLOCK_REALTIME.
+void BM_FutexWaitBitset(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+  int timeout_ns = state.range(0);
+  auto ts = absl::ToTimespec(absl::Nanoseconds(timeout_ns));
+  for (auto _ : state) {
+    EXPECT_EQ(-ETIMEDOUT, FutexWaitBitsetAbsoluteTimeout(&v, 0, 1, &ts));
+  }
+}
+
+BENCHMARK(BM_FutexWaitBitset)->Range(0, 100000);
+
+int64_t GetCurrentMonotonicTimeNanos() {
+  struct timespec ts;
+  TEST_CHECK(clock_gettime(CLOCK_MONOTONIC, &ts) != -1);
+  return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+}
+
+void SpinNanos(int64_t delay_ns) {
+  if (delay_ns <= 0) {
+    return;
+  }
+  const int64_t end = GetCurrentMonotonicTimeNanos() + delay_ns;
+  while (GetCurrentMonotonicTimeNanos() < end) {
+    // spin
+  }
+}
+
+// Each iteration of FutexRoundtripDelayed involves a thread sending a futex
+// wakeup to another thread, which spins for delay_us and then sends a futex
+// wakeup back. The time per iteration is 2*  (delay_us + kBeforeWakeDelayNs +
+// futex/scheduling overhead).
+void BM_FutexRoundtripDelayed(benchmark::State& state) {
+  const int delay_us = state.range(0);
+
+  const int64_t delay_ns = delay_us * 1000;
+  // Spin for an extra kBeforeWakeDelayNs before invoking FUTEX_WAKE to reduce
+  // the probability that the wakeup comes before the wait, preventing the wait
+  // from ever taking effect and causing the benchmark to underestimate the
+  // actual wakeup time.
+  constexpr int64_t kBeforeWakeDelayNs = 500;
+  std::atomic<int32_t> v(0);
+  ScopedThread t([&] {
+    for (int i = 0; i < state.max_iterations; i++) {
+      SpinNanos(delay_ns);
+      while (v.load(std::memory_order_acquire) == 0) {
+        FutexWait(&v, 0);
+      }
+      SpinNanos(kBeforeWakeDelayNs + delay_ns);
+      v.store(0, std::memory_order_release);
+      FutexWake(&v, 1);
+    }
+  });
+  for (auto _ : state) {
+    SpinNanos(kBeforeWakeDelayNs + delay_ns);
+    v.store(1, std::memory_order_release);
+    FutexWake(&v, 1);
+    SpinNanos(delay_ns);
+    while (v.load(std::memory_order_acquire) == 1) {
+      FutexWait(&v, 1);
+    }
+  }
+}
+
+BENCHMARK(BM_FutexRoundtripDelayed)
+    ->Arg(0)
+    ->Arg(10)
+    ->Arg(20)
+    ->Arg(50)
+    ->Arg(100);
+
+// FutexLock is a simple, dumb futex based lock implementation.
+// It will try to acquire the lock by atomically incrementing the
+// lock word. If it did not increment the lock from 0 to 1, someone
+// else has the lock, so it will FUTEX_WAIT until it is woken in
+// the unlock path.
+class FutexLock {
+ public:
+  FutexLock() : lock_word_(0) {}
+
+  void lock(struct timespec* deadline) {
+    int32_t val;
+    while ((val = lock_word_.fetch_add(1, std::memory_order_acquire) + 1) !=
+           1) {
+      // If we didn't get the lock by incrementing from 0 to 1,
+      // do a FUTEX_WAIT with the desired current value set to
+      // val. If val is no longer what the atomic increment returned,
+      // someone might have set it to 0 so we can try to acquire
+      // again.
+      int ret = FutexWaitAbsoluteTimeout(&lock_word_, val, deadline);
+      if (ret == 0 || ret == -EWOULDBLOCK || ret == -EINTR) {
+        continue;
+      } else {
+        FAIL() << "unexpected FUTEX_WAIT return: " << ret;
+      }
+    }
+  }
+
+  void unlock() {
+    // Store 0 into the lock word and wake one waiter. We intentionally
+    // ignore the return value of the FUTEX_WAKE here, since there may be
+    // no waiters to wake anyway.
+    lock_word_.store(0, std::memory_order_release);
+    (void)FutexWake(&lock_word_, 1);
+  }
+
+ private:
+  std::atomic<int32_t> lock_word_;
+};
+
+FutexLock* test_lock;  // Used below.
+
+void FutexContend(benchmark::State& state, int thread_index,
+                  struct timespec* deadline) {
+  int counter = 0;
+  if (thread_index == 0) {
+    test_lock = new FutexLock();
+  }
+  for (auto _ : state) {
+    test_lock->lock(deadline);
+    counter++;
+    test_lock->unlock();
+  }
+  if (thread_index == 0) {
+    delete test_lock;
+  }
+  state.SetItemsProcessed(state.iterations());
+}
+
+void BM_FutexContend(benchmark::State& state) {
+  FutexContend(state, state.thread_index, nullptr);
+}
+
+BENCHMARK(BM_FutexContend)->ThreadRange(1, 1024)->UseRealTime();
+
+void BM_FutexDeadlineContend(benchmark::State& state) {
+  auto deadline = absl::ToTimespec(absl::Now() + absl::Minutes(10));
+  FutexContend(state, state.thread_index, &deadline);
+}
+
+BENCHMARK(BM_FutexDeadlineContend)->ThreadRange(1, 1024)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/getdents_benchmark.cc b/test/perf/linux/getdents_benchmark.cc
new file mode 100644
index 000000000..0e03975b4
--- /dev/null
+++ b/test/perf/linux/getdents_benchmark.cc
@@ -0,0 +1,149 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+#ifndef SYS_getdents64
+#if defined(__x86_64__)
+#define SYS_getdents64 217
+#elif defined(__aarch64__)
+#define SYS_getdents64 217
+#else
+#error "Unknown architecture"
+#endif
+#endif  // SYS_getdents64
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int kBufferSize = 16384;
+
+PosixErrorOr<TempPath> CreateDirectory(int count,
+                                       std::vector<std::string>* files) {
+  ASSIGN_OR_RETURN_ERRNO(TempPath dir, TempPath::CreateDir());
+
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor dfd,
+                         Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+  for (int i = 0; i < count; i++) {
+    auto file = NewTempRelPath();
+    auto res = MknodAt(dfd, file, S_IFREG | 0644, 0);
+    RETURN_IF_ERRNO(res);
+    files->push_back(file);
+  }
+
+  return std::move(dir);
+}
+
+PosixError CleanupDirectory(const TempPath& dir,
+                            std::vector<std::string>* files) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor dfd,
+                         Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+  for (auto it = files->begin(); it != files->end(); ++it) {
+    auto res = UnlinkAt(dfd, *it, 0);
+    RETURN_IF_ERRNO(res);
+  }
+  return NoError();
+}
+
+// Creates a directory containing `files` files, and reads all the directory
+// entries from the directory using a single FD.
+void BM_GetdentsSameFD(benchmark::State& state) {
+  // Create directory with given files.
+  const int count = state.range(0);
+
+  // Keep a vector of all of the file TempPaths that is destroyed before dir.
+  //
+  // Normally, we'd simply allow dir to recursively clean up the contained
+  // files, but that recursive cleanup uses getdents, which may be very slow in
+  // extreme benchmarks.
+  TempPath dir;
+  std::vector<std::string> files;
+  dir = ASSERT_NO_ERRNO_AND_VALUE(CreateDirectory(count, &files));
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY));
+  char buffer[kBufferSize];
+
+  // We read all directory entries on each iteration, but report this as a
+  // "batch" iteration so that reported times are per file.
+  while (state.KeepRunningBatch(count)) {
+    ASSERT_THAT(lseek(fd.get(), 0, SEEK_SET), SyscallSucceeds());
+
+    int ret;
+    do {
+      ASSERT_THAT(ret = syscall(SYS_getdents64, fd.get(), buffer, kBufferSize),
+                  SyscallSucceeds());
+    } while (ret > 0);
+  }
+
+  ASSERT_NO_ERRNO(CleanupDirectory(dir, &files));
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_GetdentsSameFD)->Range(1, 1 << 16)->UseRealTime();
+
+// Creates a directory containing `files` files, and reads all the directory
+// entries from the directory using a new FD each time.
+void BM_GetdentsNewFD(benchmark::State& state) {
+  // Create directory with given files.
+  const int count = state.range(0);
+
+  // Keep a vector of all of the file TempPaths that is destroyed before dir.
+  //
+  // Normally, we'd simply allow dir to recursively clean up the contained
+  // files, but that recursive cleanup uses getdents, which may be very slow in
+  // extreme benchmarks.
+  TempPath dir;
+  std::vector<std::string> files;
+  dir = ASSERT_NO_ERRNO_AND_VALUE(CreateDirectory(count, &files));
+  char buffer[kBufferSize];
+
+  // We read all directory entries on each iteration, but report this as a
+  // "batch" iteration so that reported times are per file.
+  while (state.KeepRunningBatch(count)) {
+    FileDescriptor fd =
+        ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+    int ret;
+    do {
+      ASSERT_THAT(ret = syscall(SYS_getdents64, fd.get(), buffer, kBufferSize),
+                  SyscallSucceeds());
+    } while (ret > 0);
+  }
+
+  ASSERT_NO_ERRNO(CleanupDirectory(dir, &files));
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_GetdentsNewFD)->Range(1, 1 << 16)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/getpid_benchmark.cc b/test/perf/linux/getpid_benchmark.cc
new file mode 100644
index 000000000..db74cb264
--- /dev/null
+++ b/test/perf/linux/getpid_benchmark.cc
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Getpid(benchmark::State& state) {
+  for (auto _ : state) {
+    syscall(SYS_getpid);
+  }
+}
+
+BENCHMARK(BM_Getpid);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/gettid_benchmark.cc b/test/perf/linux/gettid_benchmark.cc
new file mode 100644
index 000000000..8f4961f5e
--- /dev/null
+++ b/test/perf/linux/gettid_benchmark.cc
@@ -0,0 +1,38 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Gettid(benchmark::State& state) {
+  for (auto _ : state) {
+    syscall(SYS_gettid);
+  }
+}
+
+BENCHMARK(BM_Gettid)->ThreadRange(1, 4000)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/mapping_benchmark.cc b/test/perf/linux/mapping_benchmark.cc
new file mode 100644
index 000000000..39c30fe69
--- /dev/null
+++ b/test/perf/linux/mapping_benchmark.cc
@@ -0,0 +1,163 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/memory_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Conservative value for /proc/sys/vm/max_map_count, which limits the number of
+// VMAs, minus a safety margin for VMAs that already exist for the test binary.
+// The default value for max_map_count is
+// include/linux/mm.h:DEFAULT_MAX_MAP_COUNT = 65530.
+constexpr size_t kMaxVMAs = 64001;
+
+// Map then unmap pages without touching them.
+void BM_MapUnmap(benchmark::State& state) {
+  // Number of pages to map.
+  const int pages = state.range(0);
+
+  while (state.KeepRunning()) {
+    void* addr = mmap(0, pages * kPageSize, PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    TEST_CHECK_MSG(addr != MAP_FAILED, "mmap failed");
+
+    int ret = munmap(addr, pages * kPageSize);
+    TEST_CHECK_MSG(ret == 0, "munmap failed");
+  }
+}
+
+BENCHMARK(BM_MapUnmap)->Range(1, 1 << 17)->UseRealTime();
+
+// Map, touch, then unmap pages.
+void BM_MapTouchUnmap(benchmark::State& state) {
+  // Number of pages to map.
+  const int pages = state.range(0);
+
+  while (state.KeepRunning()) {
+    void* addr = mmap(0, pages * kPageSize, PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    TEST_CHECK_MSG(addr != MAP_FAILED, "mmap failed");
+
+    char* c = reinterpret_cast<char*>(addr);
+    char* end = c + pages * kPageSize;
+    while (c < end) {
+      *c = 42;
+      c += kPageSize;
+    }
+
+    int ret = munmap(addr, pages * kPageSize);
+    TEST_CHECK_MSG(ret == 0, "munmap failed");
+  }
+}
+
+BENCHMARK(BM_MapTouchUnmap)->Range(1, 1 << 17)->UseRealTime();
+
+// Map and touch many pages, unmapping all at once.
+//
+// NOTE(b/111429208): This is a regression test to ensure performant mapping and
+// allocation even with tons of mappings.
+void BM_MapTouchMany(benchmark::State& state) {
+  // Number of pages to map.
+  const int page_count = state.range(0);
+
+  while (state.KeepRunning()) {
+    std::vector<void*> pages;
+
+    for (int i = 0; i < page_count; i++) {
+      void* addr = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+      TEST_CHECK_MSG(addr != MAP_FAILED, "mmap failed");
+
+      char* c = reinterpret_cast<char*>(addr);
+      *c = 42;
+
+      pages.push_back(addr);
+    }
+
+    for (void* addr : pages) {
+      int ret = munmap(addr, kPageSize);
+      TEST_CHECK_MSG(ret == 0, "munmap failed");
+    }
+  }
+
+  state.SetBytesProcessed(kPageSize * page_count * state.iterations());
+}
+
+BENCHMARK(BM_MapTouchMany)->Range(1, 1 << 12)->UseRealTime();
+
+void BM_PageFault(benchmark::State& state) {
+  // Map the region in which we will take page faults. To ensure that each page
+  // fault maps only a single page, each page we touch must correspond to a
+  // distinct VMA. Thus we need a 1-page gap between each 1-page VMA. However,
+  // each gap consists of a PROT_NONE VMA, instead of an unmapped hole, so that
+  // if there are background threads running, they can't inadvertently creating
+  // mappings in our gaps that are unmapped when the test ends.
+  size_t test_pages = kMaxVMAs;
+  // Ensure that test_pages is odd, since we want the test region to both
+  // begin and end with a mapped page.
+  if (test_pages % 2 == 0) {
+    test_pages--;
+  }
+  const size_t test_region_bytes = test_pages * kPageSize;
+  // Use MAP_SHARED here because madvise(MADV_DONTNEED) on private mappings on
+  // gVisor won't force future sentry page faults (by design). Use MAP_POPULATE
+  // so that Linux pre-allocates the shmem file used to back the mapping.
+  Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+      MmapAnon(test_region_bytes, PROT_READ, MAP_SHARED | MAP_POPULATE));
+  for (size_t i = 0; i < test_pages / 2; i++) {
+    ASSERT_THAT(
+        mprotect(reinterpret_cast<void*>(m.addr() + ((2 * i + 1) * kPageSize)),
+                 kPageSize, PROT_NONE),
+        SyscallSucceeds());
+  }
+
+  const size_t mapped_pages = test_pages / 2 + 1;
+  // "Start" at the end of the mapped region to force the mapped region to be
+  // reset, since we mapped it with MAP_POPULATE.
+  size_t cur_page = mapped_pages;
+  for (auto _ : state) {
+    if (cur_page >= mapped_pages) {
+      // We've reached the end of our mapped region and have to reset it to
+      // incur page faults again.
+      state.PauseTiming();
+      ASSERT_THAT(madvise(m.ptr(), test_region_bytes, MADV_DONTNEED),
+                  SyscallSucceeds());
+      cur_page = 0;
+      state.ResumeTiming();
+    }
+    const uintptr_t addr = m.addr() + (2 * cur_page * kPageSize);
+    const char c = *reinterpret_cast<volatile char*>(addr);
+    benchmark::DoNotOptimize(c);
+    cur_page++;
+  }
+}
+
+BENCHMARK(BM_PageFault)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/open_benchmark.cc b/test/perf/linux/open_benchmark.cc
new file mode 100644
index 000000000..68008f6d5
--- /dev/null
+++ b/test/perf/linux/open_benchmark.cc
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Open(benchmark::State& state) {
+  const int size = state.range(0);
+  std::vector<TempPath> cache;
+  for (int i = 0; i < size; i++) {
+    auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+    cache.emplace_back(std::move(path));
+  }
+
+  unsigned int seed = 1;
+  for (auto _ : state) {
+    const int chosen = rand_r(&seed) % size;
+    int fd = open(cache[chosen].path().c_str(), O_RDONLY);
+    TEST_CHECK(fd != -1);
+    close(fd);
+  }
+}
+
+BENCHMARK(BM_Open)->Range(1, 128)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/pipe_benchmark.cc b/test/perf/linux/pipe_benchmark.cc
new file mode 100644
index 000000000..8f5f6a2a3
--- /dev/null
+++ b/test/perf/linux/pipe_benchmark.cc
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cerrno>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Pipe(benchmark::State& state) {
+  int fds[2];
+  TEST_CHECK(pipe(fds) == 0);
+
+  const int size = state.range(0);
+  std::vector<char> wbuf(size);
+  std::vector<char> rbuf(size);
+  RandomizeBuffer(wbuf.data(), size);
+
+  ScopedThread t([&] {
+    auto const fd = fds[1];
+    for (int i = 0; i < state.max_iterations; i++) {
+      TEST_CHECK(WriteFd(fd, wbuf.data(), wbuf.size()) == size);
+    }
+  });
+
+  for (auto _ : state) {
+    TEST_CHECK(ReadFd(fds[0], rbuf.data(), rbuf.size()) == size);
+  }
+
+  t.Join();
+
+  close(fds[0]);
+  close(fds[1]);
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_Pipe)->Range(1, 1 << 20)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/randread_benchmark.cc b/test/perf/linux/randread_benchmark.cc
new file mode 100644
index 000000000..b0eb8c24e
--- /dev/null
+++ b/test/perf/linux/randread_benchmark.cc
@@ -0,0 +1,100 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Create a 1GB file that will be read from at random positions. This should
+// invalid any performance gains from caching.
+const uint64_t kFileSize = 1ULL << 30;
+
+// How many bytes to write at once to initialize the file used to read from.
+const uint32_t kWriteSize = 65536;
+
+// Largest benchmarked read unit.
+const uint32_t kMaxRead = 1UL << 26;
+
+TempPath CreateFile(uint64_t file_size) {
+  auto path = TempPath::CreateFile().ValueOrDie();
+  FileDescriptor fd = Open(path.path(), O_WRONLY).ValueOrDie();
+
+  // Try to minimize syscalls by using maximum size writev() requests.
+  std::vector<char> buffer(kWriteSize);
+  RandomizeBuffer(buffer.data(), buffer.size());
+  const std::vector<std::vector<struct iovec>> iovecs_list =
+      GenerateIovecs(file_size, buffer.data(), buffer.size());
+  for (const auto& iovecs : iovecs_list) {
+    TEST_CHECK(writev(fd.get(), iovecs.data(), iovecs.size()) >= 0);
+  }
+
+  return path;
+}
+
+// Global test state, initialized once per process lifetime.
+struct GlobalState {
+  const TempPath tmpfile;
+  explicit GlobalState(TempPath tfile) : tmpfile(std::move(tfile)) {}
+};
+
+GlobalState& GetGlobalState() {
+  // This gets created only once throughout the lifetime of the process.
+  // Use a dynamically allocated object (that is never deleted) to avoid order
+  // of destruction of static storage variables issues.
+  static GlobalState* const state =
+      // The actual file size is the maximum random seek range (kFileSize) + the
+      // maximum read size so we can read that number of bytes at the end of the
+      // file.
+      new GlobalState(CreateFile(kFileSize + kMaxRead));
+  return *state;
+}
+
+void BM_RandRead(benchmark::State& state) {
+  const int size = state.range(0);
+
+  GlobalState& global_state = GetGlobalState();
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(global_state.tmpfile.path(), O_RDONLY));
+  std::vector<char> buf(size);
+
+  unsigned int seed = 1;
+  for (auto _ : state) {
+    TEST_CHECK(PreadFd(fd.get(), buf.data(), buf.size(),
+                       rand_r(&seed) % kFileSize) == size);
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_RandRead)->Range(1, kMaxRead)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/read_benchmark.cc b/test/perf/linux/read_benchmark.cc
new file mode 100644
index 000000000..62445867d
--- /dev/null
+++ b/test/perf/linux/read_benchmark.cc
@@ -0,0 +1,53 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Read(benchmark::State& state) {
+  const int size = state.range(0);
+  const std::string contents(size, 0);
+  auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), contents, TempPath::kDefaultFileMode));
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDONLY));
+
+  std::vector<char> buf(size);
+  for (auto _ : state) {
+    TEST_CHECK(PreadFd(fd.get(), buf.data(), buf.size(), 0) == size);
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_Read)->Range(1, 1 << 26)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/sched_yield_benchmark.cc b/test/perf/linux/sched_yield_benchmark.cc
new file mode 100644
index 000000000..6756b5575
--- /dev/null
+++ b/test/perf/linux/sched_yield_benchmark.cc
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sched.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Sched_yield(benchmark::State& state) {
+  for (auto ignored : state) {
+    TEST_CHECK(sched_yield() == 0);
+  }
+}
+
+BENCHMARK(BM_Sched_yield)->ThreadRange(1, 2000)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/send_recv_benchmark.cc b/test/perf/linux/send_recv_benchmark.cc
new file mode 100644
index 000000000..d73e49523
--- /dev/null
+++ b/test/perf/linux/send_recv_benchmark.cc
@@ -0,0 +1,372 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+
+#include <cstring>
+
+#include "gtest/gtest.h"
+#include "absl/synchronization/notification.h"
+#include "benchmark/benchmark.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr ssize_t kMessageSize = 1024;
+
+class Message {
+ public:
+  explicit Message(int byte = 0) : Message(byte, kMessageSize, 0) {}
+
+  explicit Message(int byte, int sz) : Message(byte, sz, 0) {}
+
+  explicit Message(int byte, int sz, int cmsg_sz)
+      : buffer_(sz, byte), cmsg_buffer_(cmsg_sz, 0) {
+    iov_.iov_base = buffer_.data();
+    iov_.iov_len = sz;
+    hdr_.msg_iov = &iov_;
+    hdr_.msg_iovlen = 1;
+    hdr_.msg_control = cmsg_buffer_.data();
+    hdr_.msg_controllen = cmsg_sz;
+  }
+
+  struct msghdr* header() {
+    return &hdr_;
+  }
+
+ private:
+  std::vector<char> buffer_;
+  std::vector<char> cmsg_buffer_;
+  struct iovec iov_ = {};
+  struct msghdr hdr_ = {};
+};
+
+void BM_Recvmsg(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  Message send_msg('a'), recv_msg;
+
+  ScopedThread t([&send_msg, &send_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      sendmsg(send_socket.get(), send_msg.header(), 0);
+    }
+  });
+
+  int64_t bytes_received = 0;
+  for (auto ignored : state) {
+    int n = recvmsg(recv_socket.get(), recv_msg.header(), 0);
+    TEST_CHECK(n > 0);
+    bytes_received += n;
+  }
+
+  notification.Notify();
+  recv_socket.reset();
+
+  state.SetBytesProcessed(bytes_received);
+}
+
+BENCHMARK(BM_Recvmsg)->UseRealTime();
+
+void BM_Sendmsg(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  Message send_msg('a'), recv_msg;
+
+  ScopedThread t([&recv_msg, &recv_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      recvmsg(recv_socket.get(), recv_msg.header(), 0);
+    }
+  });
+
+  int64_t bytes_sent = 0;
+  for (auto ignored : state) {
+    int n = sendmsg(send_socket.get(), send_msg.header(), 0);
+    TEST_CHECK(n > 0);
+    bytes_sent += n;
+  }
+
+  notification.Notify();
+  send_socket.reset();
+
+  state.SetBytesProcessed(bytes_sent);
+}
+
+BENCHMARK(BM_Sendmsg)->UseRealTime();
+
+void BM_Recvfrom(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  char send_buffer[kMessageSize], recv_buffer[kMessageSize];
+
+  ScopedThread t([&send_socket, &send_buffer, &notification] {
+    while (!notification.HasBeenNotified()) {
+      sendto(send_socket.get(), send_buffer, kMessageSize, 0, nullptr, 0);
+    }
+  });
+
+  int bytes_received = 0;
+  for (auto ignored : state) {
+    int n = recvfrom(recv_socket.get(), recv_buffer, kMessageSize, 0, nullptr,
+                     nullptr);
+    TEST_CHECK(n > 0);
+    bytes_received += n;
+  }
+
+  notification.Notify();
+  recv_socket.reset();
+
+  state.SetBytesProcessed(bytes_received);
+}
+
+BENCHMARK(BM_Recvfrom)->UseRealTime();
+
+void BM_Sendto(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  char send_buffer[kMessageSize], recv_buffer[kMessageSize];
+
+  ScopedThread t([&recv_socket, &recv_buffer, &notification] {
+    while (!notification.HasBeenNotified()) {
+      recvfrom(recv_socket.get(), recv_buffer, kMessageSize, 0, nullptr,
+               nullptr);
+    }
+  });
+
+  int64_t bytes_sent = 0;
+  for (auto ignored : state) {
+    int n = sendto(send_socket.get(), send_buffer, kMessageSize, 0, nullptr, 0);
+    TEST_CHECK(n > 0);
+    bytes_sent += n;
+  }
+
+  notification.Notify();
+  send_socket.reset();
+
+  state.SetBytesProcessed(bytes_sent);
+}
+
+BENCHMARK(BM_Sendto)->UseRealTime();
+
+PosixErrorOr<sockaddr_storage> InetLoopbackAddr(int family) {
+  struct sockaddr_storage addr;
+  memset(&addr, 0, sizeof(addr));
+  addr.ss_family = family;
+  switch (family) {
+    case AF_INET:
+      reinterpret_cast<struct sockaddr_in*>(&addr)->sin_addr.s_addr =
+          htonl(INADDR_LOOPBACK);
+      break;
+    case AF_INET6:
+      reinterpret_cast<struct sockaddr_in6*>(&addr)->sin6_addr =
+          in6addr_loopback;
+      break;
+    default:
+      return PosixError(EINVAL,
+                        absl::StrCat("unknown socket family: ", family));
+  }
+  return addr;
+}
+
+// BM_RecvmsgWithControlBuf measures the performance of recvmsg when we allocate
+// space for control messages. Note that we do not expect to receive any.
+void BM_RecvmsgWithControlBuf(benchmark::State& state) {
+  auto listen_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP));
+
+  // Initialize address to the loopback one.
+  sockaddr_storage addr = ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(AF_INET6));
+  socklen_t addrlen = sizeof(addr);
+
+  // Bind to some port then start listening.
+  ASSERT_THAT(bind(listen_socket.get(),
+                   reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+              SyscallSucceeds());
+
+  ASSERT_THAT(listen(listen_socket.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the address we're listening on, then connect to it. We need to do this
+  // because we're allowing the stack to pick a port for us.
+  ASSERT_THAT(getsockname(listen_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  auto send_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP));
+
+  ASSERT_THAT(
+      RetryEINTR(connect)(send_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+      SyscallSucceeds());
+
+  // Accept the connection.
+  auto recv_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_socket.get(), nullptr, nullptr));
+
+  absl::Notification notification;
+  Message send_msg('a');
+  // Create a msghdr with a buffer allocated for control messages.
+  Message recv_msg(0, kMessageSize, /*cmsg_sz=*/24);
+
+  ScopedThread t([&send_msg, &send_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      sendmsg(send_socket.get(), send_msg.header(), 0);
+    }
+  });
+
+  int64_t bytes_received = 0;
+  for (auto ignored : state) {
+    int n = recvmsg(recv_socket.get(), recv_msg.header(), 0);
+    TEST_CHECK(n > 0);
+    bytes_received += n;
+  }
+
+  notification.Notify();
+  recv_socket.reset();
+
+  state.SetBytesProcessed(bytes_received);
+}
+
+BENCHMARK(BM_RecvmsgWithControlBuf)->UseRealTime();
+
+// BM_SendmsgTCP measures the sendmsg throughput with varying payload sizes.
+//
+// state.Args[0] indicates whether the underlying socket should be blocking or
+// non-blocking w/ 0 indicating non-blocking and 1 to indicate blocking.
+// state.Args[1] is the size of the payload to be used per sendmsg call.
+void BM_SendmsgTCP(benchmark::State& state) {
+  auto listen_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
+
+  // Initialize address to the loopback one.
+  sockaddr_storage addr = ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(AF_INET));
+  socklen_t addrlen = sizeof(addr);
+
+  // Bind to some port then start listening.
+  ASSERT_THAT(bind(listen_socket.get(),
+                   reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+              SyscallSucceeds());
+
+  ASSERT_THAT(listen(listen_socket.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the address we're listening on, then connect to it. We need to do this
+  // because we're allowing the stack to pick a port for us.
+  ASSERT_THAT(getsockname(listen_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  auto send_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
+
+  ASSERT_THAT(
+      RetryEINTR(connect)(send_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+      SyscallSucceeds());
+
+  // Accept the connection.
+  auto recv_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_socket.get(), nullptr, nullptr));
+
+  // Check if we want to run the test w/ a blocking send socket
+  // or non-blocking.
+  const int blocking = state.range(0);
+  if (!blocking) {
+    // Set the send FD to O_NONBLOCK.
+    int opts;
+    ASSERT_THAT(opts = fcntl(send_socket.get(), F_GETFL), SyscallSucceeds());
+    opts |= O_NONBLOCK;
+    ASSERT_THAT(fcntl(send_socket.get(), F_SETFL, opts), SyscallSucceeds());
+  }
+
+  absl::Notification notification;
+
+  // Get the buffer size we should use for this iteration of the test.
+  const int buf_size = state.range(1);
+  Message send_msg('a', buf_size), recv_msg(0, buf_size);
+
+  ScopedThread t([&recv_msg, &recv_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      TEST_CHECK(recvmsg(recv_socket.get(), recv_msg.header(), 0) >= 0);
+    }
+  });
+
+  int64_t bytes_sent = 0;
+  int ncalls = 0;
+  for (auto ignored : state) {
+    int sent = 0;
+    while (true) {
+      struct msghdr hdr = {};
+      struct iovec iov = {};
+      struct msghdr* snd_header = send_msg.header();
+      iov.iov_base = static_cast<char*>(snd_header->msg_iov->iov_base) + sent;
+      iov.iov_len = snd_header->msg_iov->iov_len - sent;
+      hdr.msg_iov = &iov;
+      hdr.msg_iovlen = 1;
+      int n = RetryEINTR(sendmsg)(send_socket.get(), &hdr, 0);
+      ncalls++;
+      if (n > 0) {
+        sent += n;
+        if (sent == buf_size) {
+          break;
+        }
+        // n can be > 0 but less than requested size. In which case we don't
+        // poll.
+        continue;
+      }
+      // Poll the fd for it to become writable.
+      struct pollfd poll_fd = {send_socket.get(), POLL_OUT, 0};
+      EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10),
+                  SyscallSucceedsWithValue(0));
+    }
+    bytes_sent += static_cast<int64_t>(sent);
+  }
+
+  notification.Notify();
+  send_socket.reset();
+  state.SetBytesProcessed(bytes_sent);
+}
+
+void Args(benchmark::internal::Benchmark* benchmark) {
+  for (int blocking = 0; blocking < 2; blocking++) {
+    for (int buf_size = 1024; buf_size <= 256 << 20; buf_size *= 2) {
+      benchmark->Args({blocking, buf_size});
+    }
+  }
+}
+
+BENCHMARK(BM_SendmsgTCP)->Apply(&Args)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/seqwrite_benchmark.cc b/test/perf/linux/seqwrite_benchmark.cc
new file mode 100644
index 000000000..af49e4477
--- /dev/null
+++ b/test/perf/linux/seqwrite_benchmark.cc
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// The maximum file size of the test file, when writes get beyond this point
+// they wrap around. This should be large enough to blow away caches.
+const uint64_t kMaxFile = 1 << 30;
+
+// Perform writes of various sizes sequentially to one file. Wraps around if it
+// goes above a certain maximum file size.
+void BM_SeqWrite(benchmark::State& state) {
+  auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_WRONLY));
+
+  const int size = state.range(0);
+  std::vector<char> buf(size);
+  RandomizeBuffer(buf.data(), buf.size());
+
+  // Start writes at offset 0.
+  uint64_t offset = 0;
+  for (auto _ : state) {
+    TEST_CHECK(PwriteFd(fd.get(), buf.data(), buf.size(), offset) ==
+               buf.size());
+    offset += buf.size();
+    // Wrap around if going above the maximum file size.
+    if (offset >= kMaxFile) {
+      offset = 0;
+    }
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_SeqWrite)->Range(1, 1 << 26)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/signal_benchmark.cc b/test/perf/linux/signal_benchmark.cc
new file mode 100644
index 000000000..a6928df58
--- /dev/null
+++ b/test/perf/linux/signal_benchmark.cc
@@ -0,0 +1,59 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <string.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void FixupHandler(int sig, siginfo_t* si, void* void_ctx) {
+  static unsigned int dataval = 0;
+
+  // Skip the offending instruction.
+  ucontext_t* ctx = reinterpret_cast<ucontext_t*>(void_ctx);
+  ctx->uc_mcontext.gregs[REG_RAX] = reinterpret_cast<greg_t>(&dataval);
+}
+
+void BM_FaultSignalFixup(benchmark::State& state) {
+  // Set up the signal handler.
+  struct sigaction sa = {};
+  sigemptyset(&sa.sa_mask);
+  sa.sa_sigaction = FixupHandler;
+  sa.sa_flags = SA_SIGINFO;
+  TEST_CHECK(sigaction(SIGSEGV, &sa, nullptr) == 0);
+
+  // Fault, fault, fault.
+  for (auto _ : state) {
+    register volatile unsigned int* ptr asm("rax");
+
+    // Trigger the segfault.
+    ptr = nullptr;
+    *ptr = 0;
+  }
+}
+
+BENCHMARK(BM_FaultSignalFixup)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/sleep_benchmark.cc b/test/perf/linux/sleep_benchmark.cc
new file mode 100644
index 000000000..99ef05117
--- /dev/null
+++ b/test/perf/linux/sleep_benchmark.cc
@@ -0,0 +1,60 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Sleep for 'param' nanoseconds.
+void BM_Sleep(benchmark::State& state) {
+  const int nanoseconds = state.range(0);
+
+  for (auto _ : state) {
+    struct timespec ts;
+    ts.tv_sec = 0;
+    ts.tv_nsec = nanoseconds;
+
+    int ret;
+    do {
+      ret = syscall(SYS_nanosleep, &ts, &ts);
+      if (ret < 0) {
+        TEST_CHECK(errno == EINTR);
+      }
+    } while (ret < 0);
+  }
+}
+
+BENCHMARK(BM_Sleep)
+    ->Arg(0)
+    ->Arg(1)
+    ->Arg(1000)              // 1us
+    ->Arg(1000 * 1000)       // 1ms
+    ->Arg(10 * 1000 * 1000)  // 10ms
+    ->Arg(50 * 1000 * 1000)  // 50ms
+    ->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/stat_benchmark.cc b/test/perf/linux/stat_benchmark.cc
new file mode 100644
index 000000000..f15424482
--- /dev/null
+++ b/test/perf/linux/stat_benchmark.cc
@@ -0,0 +1,62 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Creates a file in a nested directory hierarchy at least `depth` directories
+// deep, and stats that file multiple times.
+void BM_Stat(benchmark::State& state) {
+  // Create nested directories with given depth.
+  int depth = state.range(0);
+  const TempPath top_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  std::string dir_path = top_dir.path();
+
+  while (depth-- > 0) {
+    // Don't use TempPath because it will make paths too long to use.
+    //
+    // The top_dir destructor will clean up this whole tree.
+    dir_path = JoinPath(dir_path, absl::StrCat(depth));
+    ASSERT_NO_ERRNO(Mkdir(dir_path, 0755));
+  }
+
+  // Create the file that will be stat'd.
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir_path));
+
+  struct stat st;
+  for (auto _ : state) {
+    ASSERT_THAT(stat(file.path().c_str(), &st), SyscallSucceeds());
+  }
+}
+
+BENCHMARK(BM_Stat)->Range(1, 100)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/unlink_benchmark.cc b/test/perf/linux/unlink_benchmark.cc
new file mode 100644
index 000000000..92243a042
--- /dev/null
+++ b/test/perf/linux/unlink_benchmark.cc
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Creates a directory containing `files` files, and unlinks all the files.
+void BM_Unlink(benchmark::State& state) {
+  // Create directory with given files.
+  const int file_count = state.range(0);
+
+  // We unlink all files on each iteration, but report this as a "batch"
+  // iteration so that reported times are per file.
+  TempPath dir;
+  while (state.KeepRunningBatch(file_count)) {
+    state.PauseTiming();
+    // N.B. dir is declared outside the loop so that destruction of the previous
+    // iteration's directory occurs here, inside of PauseTiming.
+    dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+    std::vector<TempPath> files;
+    for (int i = 0; i < file_count; i++) {
+      TempPath file =
+          ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+      files.push_back(std::move(file));
+    }
+    state.ResumeTiming();
+
+    while (!files.empty()) {
+      // Destructor unlinks.
+      files.pop_back();
+    }
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_Unlink)->Range(1, 100 * 1000)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/write_benchmark.cc b/test/perf/linux/write_benchmark.cc
new file mode 100644
index 000000000..7b060c70e
--- /dev/null
+++ b/test/perf/linux/write_benchmark.cc
@@ -0,0 +1,52 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_Write(benchmark::State& state) {
+  auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_WRONLY));
+
+  const int size = state.range(0);
+  std::vector<char> buf(size);
+  RandomizeBuffer(buf.data(), size);
+
+  for (auto _ : state) {
+    TEST_CHECK(PwriteFd(fd.get(), buf.data(), size, 0) == size);
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(size) *
+                          static_cast<int64_t>(state.iterations()));
+}
+
+BENCHMARK(BM_Write)->Range(1, 1 << 26)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/runner/BUILD b/test/runner/BUILD
new file mode 100644
index 000000000..9959ef9b0
--- /dev/null
+++ b/test/runner/BUILD
@@ -0,0 +1,22 @@
+load("//tools:defs.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "runner",
+    testonly = 1,
+    srcs = ["runner.go"],
+    data = [
+        "//runsc",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/log",
+        "//runsc/specutils",
+        "//runsc/testutil",
+        "//test/runner/gtest",
+        "//test/uds",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl
new file mode 100644
index 000000000..5e97c1867
--- /dev/null
+++ b/test/runner/defs.bzl
@@ -0,0 +1,218 @@
+"""Defines a rule for syscall test targets."""
+
+load("//tools:defs.bzl", "loopback")
+
+def _runner_test_impl(ctx):
+    # Generate a runner binary.
+    runner = ctx.actions.declare_file("%s-runner" % ctx.label.name)
+    runner_content = "\n".join([
+        "#!/bin/bash",
+        "set -euf -x -o pipefail",
+        "if [[ -n \"${TEST_UNDECLARED_OUTPUTS_DIR}\" ]]; then",
+        "  mkdir -p \"${TEST_UNDECLARED_OUTPUTS_DIR}\"",
+        "  chmod a+rwx \"${TEST_UNDECLARED_OUTPUTS_DIR}\"",
+        "fi",
+        "exec %s %s %s\n" % (
+            ctx.files.runner[0].short_path,
+            " ".join(ctx.attr.runner_args),
+            ctx.files.test[0].short_path,
+        ),
+    ])
+    ctx.actions.write(runner, runner_content, is_executable = True)
+
+    # Return with all transitive files.
+    runfiles = ctx.runfiles(
+        transitive_files = depset(transitive = [
+            depset(target.data_runfiles.files)
+            for target in (ctx.attr.runner, ctx.attr.test)
+            if hasattr(target, "data_runfiles")
+        ]),
+        files = ctx.files.runner + ctx.files.test,
+        collect_default = True,
+        collect_data = True,
+    )
+    return [DefaultInfo(executable = runner, runfiles = runfiles)]
+
+_runner_test = rule(
+    attrs = {
+        "runner": attr.label(
+            default = "//test/runner:runner",
+        ),
+        "test": attr.label(
+            mandatory = True,
+        ),
+        "runner_args": attr.string_list(),
+        "data": attr.label_list(
+            allow_files = True,
+        ),
+    },
+    test = True,
+    implementation = _runner_test_impl,
+)
+
+def _syscall_test(
+        test,
+        shard_count,
+        size,
+        platform,
+        use_tmpfs,
+        tags,
+        network = "none",
+        file_access = "exclusive",
+        overlay = False,
+        add_uds_tree = False):
+    # Prepend "runsc" to non-native platform names.
+    full_platform = platform if platform == "native" else "runsc_" + platform
+
+    # Name the test appropriately.
+    name = test.split(":")[1] + "_" + full_platform
+    if file_access == "shared":
+        name += "_shared"
+    if overlay:
+        name += "_overlay"
+    if network != "none":
+        name += "_" + network + "net"
+
+    # Apply all tags.
+    if tags == None:
+        tags = []
+
+    # Add the full_platform and file access in a tag to make it easier to run
+    # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared.
+    tags += [full_platform, "file_" + file_access]
+
+    # Hash this target into one of 15 buckets. This can be used to
+    # randomly split targets between different workflows.
+    hash15 = hash(native.package_name() + name) % 15
+    tags.append("hash15:" + str(hash15))
+
+    # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until
+    # we figure out how to request ipv4 sockets on Guitar machines.
+    if network == "host":
+        tags.append("noguitar")
+
+    # Disable off-host networking.
+    tags.append("requires-net:loopback")
+
+    # Add tag to prevent the tests from running in a Bazel sandbox.
+    # TODO(b/120560048): Make the tests run without this tag.
+    tags.append("no-sandbox")
+
+    # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is
+    # more stable.
+    if platform == "kvm":
+        tags.append("manual")
+        tags.append("requires-kvm")
+
+        # TODO(b/112165693): Remove when tests pass reliably.
+        tags.append("notap")
+
+    runner_args = [
+        # Arguments are passed directly to runner binary.
+        "--platform=" + platform,
+        "--network=" + network,
+        "--use-tmpfs=" + str(use_tmpfs),
+        "--file-access=" + file_access,
+        "--overlay=" + str(overlay),
+        "--add-uds-tree=" + str(add_uds_tree),
+    ]
+
+    # Call the rule above.
+    _runner_test(
+        name = name,
+        test = test,
+        runner_args = runner_args,
+        data = [loopback],
+        size = size,
+        tags = tags,
+        shard_count = shard_count,
+    )
+
+def syscall_test(
+        test,
+        shard_count = 5,
+        size = "small",
+        use_tmpfs = False,
+        add_overlay = False,
+        add_uds_tree = False,
+        add_hostinet = False,
+        tags = None):
+    """syscall_test is a macro that will create targets for all platforms.
+
+    Args:
+      test: the test target.
+      shard_count: shards for defined tests.
+      size: the defined test size.
+      use_tmpfs: use tmpfs in the defined tests.
+      add_overlay: add an overlay test.
+      add_uds_tree: add a UDS test.
+      add_hostinet: add a hostinet test.
+      tags: starting test tags.
+    """
+
+    _syscall_test(
+        test = test,
+        shard_count = shard_count,
+        size = size,
+        platform = "native",
+        use_tmpfs = False,
+        add_uds_tree = add_uds_tree,
+        tags = tags,
+    )
+
+    _syscall_test(
+        test = test,
+        shard_count = shard_count,
+        size = size,
+        platform = "kvm",
+        use_tmpfs = use_tmpfs,
+        add_uds_tree = add_uds_tree,
+        tags = tags,
+    )
+
+    _syscall_test(
+        test = test,
+        shard_count = shard_count,
+        size = size,
+        platform = "ptrace",
+        use_tmpfs = use_tmpfs,
+        add_uds_tree = add_uds_tree,
+        tags = tags,
+    )
+
+    if add_overlay:
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = "ptrace",
+            use_tmpfs = False,  # overlay is adding a writable tmpfs on top of root.
+            add_uds_tree = add_uds_tree,
+            tags = tags,
+            overlay = True,
+        )
+
+    if not use_tmpfs:
+        # Also test shared gofer access.
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = "ptrace",
+            use_tmpfs = use_tmpfs,
+            add_uds_tree = add_uds_tree,
+            tags = tags,
+            file_access = "shared",
+        )
+
+    if add_hostinet:
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = "ptrace",
+            use_tmpfs = use_tmpfs,
+            network = "host",
+            add_uds_tree = add_uds_tree,
+            tags = tags,
+        )
diff --git a/test/runner/gtest/BUILD b/test/runner/gtest/BUILD
new file mode 100644
index 000000000..de4b2727c
--- /dev/null
+++ b/test/runner/gtest/BUILD
@@ -0,0 +1,9 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "gtest",
+    srcs = ["gtest.go"],
+    visibility = ["//:sandbox"],
+)
diff --git a/test/runner/gtest/gtest.go b/test/runner/gtest/gtest.go
new file mode 100644
index 000000000..23bf7b5f6
--- /dev/null
+++ b/test/runner/gtest/gtest.go
@@ -0,0 +1,154 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gtest contains helpers for running google-test tests from Go.
+package gtest
+
+import (
+	"fmt"
+	"os/exec"
+	"strings"
+)
+
+var (
+	// listTestFlag is the flag that will list tests in gtest binaries.
+	listTestFlag = "--gtest_list_tests"
+
+	// filterTestFlag is the flag that will filter tests in gtest binaries.
+	filterTestFlag = "--gtest_filter"
+
+	// listBechmarkFlag is the flag that will list benchmarks in gtest binaries.
+	listBenchmarkFlag = "--benchmark_list_tests"
+
+	// filterBenchmarkFlag is the flag that will run specified benchmarks.
+	filterBenchmarkFlag = "--benchmark_filter"
+)
+
+// TestCase is a single gtest test case.
+type TestCase struct {
+	// Suite is the suite for this test.
+	Suite string
+
+	// Name is the name of this individual test.
+	Name string
+
+	// benchmark indicates that this is a benchmark. In this case, the
+	// suite will be empty, and we will use the appropriate test and
+	// benchmark flags.
+	benchmark bool
+}
+
+// FullName returns the name of the test including the suite. It is suitable to
+// pass to "-gtest_filter".
+func (tc TestCase) FullName() string {
+	return fmt.Sprintf("%s.%s", tc.Suite, tc.Name)
+}
+
+// Args returns arguments to be passed when invoking the test.
+func (tc TestCase) Args() []string {
+	if tc.benchmark {
+		return []string{
+			fmt.Sprintf("%s=^$", filterTestFlag),
+			fmt.Sprintf("%s=^%s$", filterBenchmarkFlag, tc.Name),
+		}
+	}
+	return []string{
+		fmt.Sprintf("%s=^%s$", filterTestFlag, tc.FullName()),
+		fmt.Sprintf("%s=^$", filterBenchmarkFlag),
+	}
+}
+
+// ParseTestCases calls a gtest test binary to list its test and returns a
+// slice with the name and suite of each test.
+//
+// If benchmarks is true, then benchmarks will be included in the list of test
+// cases provided. Note that this requires the binary to support the
+// benchmarks_list_tests flag.
+func ParseTestCases(testBin string, benchmarks bool, extraArgs ...string) ([]TestCase, error) {
+	// Run to extract test cases.
+	args := append([]string{listTestFlag}, extraArgs...)
+	cmd := exec.Command(testBin, args...)
+	out, err := cmd.Output()
+	if err != nil {
+		exitErr, ok := err.(*exec.ExitError)
+		if !ok {
+			return nil, fmt.Errorf("could not enumerate gtest tests: %v", err)
+		}
+		return nil, fmt.Errorf("could not enumerate gtest tests: %v\nstderr:\n%s", err, exitErr.Stderr)
+	}
+
+	// Parse test output.
+	var t []TestCase
+	var suite string
+	for _, line := range strings.Split(string(out), "\n") {
+		// Strip comments.
+		line = strings.Split(line, "#")[0]
+
+		// New suite?
+		if !strings.HasPrefix(line, " ") {
+			suite = strings.TrimSuffix(strings.TrimSpace(line), ".")
+			continue
+		}
+
+		// Individual test.
+		name := strings.TrimSpace(line)
+
+		// Do we have a suite yet?
+		if suite == "" {
+			return nil, fmt.Errorf("test without a suite: %v", name)
+		}
+
+		// Add this individual test.
+		t = append(t, TestCase{
+			Suite: suite,
+			Name:  name,
+		})
+
+	}
+
+	// Finished?
+	if !benchmarks {
+		return t, nil
+	}
+
+	// Run again to extract benchmarks.
+	args = append([]string{listBenchmarkFlag}, extraArgs...)
+	cmd = exec.Command(testBin, args...)
+	out, err = cmd.Output()
+	if err != nil {
+		exitErr, ok := err.(*exec.ExitError)
+		if !ok {
+			return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v", err)
+		}
+		return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v\nstderr\n%s", err, exitErr.Stderr)
+	}
+
+	// Parse benchmark output.
+	for _, line := range strings.Split(string(out), "\n") {
+		// Strip comments.
+		line = strings.Split(line, "#")[0]
+
+		// Single benchmark.
+		name := strings.TrimSpace(line)
+
+		// Add the single benchmark.
+		t = append(t, TestCase{
+			Suite:     "Benchmarks",
+			Name:      name,
+			benchmark: true,
+		})
+	}
+
+	return t, nil
+}
diff --git a/test/runner/runner.go b/test/runner/runner.go
new file mode 100644
index 000000000..a78ef38e0
--- /dev/null
+++ b/test/runner/runner.go
@@ -0,0 +1,477 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary syscall_test_runner runs the syscall test suites in gVisor
+// containers and on the host platform.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"os/signal"
+	"path/filepath"
+	"strings"
+	"syscall"
+	"testing"
+	"time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/runsc/specutils"
+	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/test/runner/gtest"
+	"gvisor.dev/gvisor/test/uds"
+)
+
+var (
+	debug      = flag.Bool("debug", false, "enable debug logs")
+	strace     = flag.Bool("strace", false, "enable strace logs")
+	platform   = flag.String("platform", "ptrace", "platform to run on")
+	network    = flag.String("network", "none", "network stack to run on (sandbox, host, none)")
+	useTmpfs   = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp")
+	fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode")
+	overlay    = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
+	parallel   = flag.Bool("parallel", false, "run tests in parallel")
+	runscPath  = flag.String("runsc", "", "path to runsc binary")
+
+	addUDSTree = flag.Bool("add-uds-tree", false, "expose a tree of UDS utilities for use in tests")
+)
+
+// runTestCaseNative runs the test case directly on the host machine.
+func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) {
+	// These tests might be running in parallel, so make sure they have a
+	// unique test temp dir.
+	tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
+	if err != nil {
+		t.Fatalf("could not create temp dir: %v", err)
+	}
+	defer os.RemoveAll(tmpDir)
+
+	// Replace TEST_TMPDIR in the current environment with something
+	// unique.
+	env := os.Environ()
+	newEnvVar := "TEST_TMPDIR=" + tmpDir
+	var found bool
+	for i, kv := range env {
+		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
+			env[i] = newEnvVar
+			found = true
+			break
+		}
+	}
+	if !found {
+		env = append(env, newEnvVar)
+	}
+	// Remove env variables that cause the gunit binary to write output
+	// files, since they will stomp on eachother, and on the output files
+	// from this go test.
+	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
+
+	// Remove shard env variables so that the gunit binary does not try to
+	// intepret them.
+	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
+
+	if *addUDSTree {
+		socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
+		if err != nil {
+			t.Fatalf("failed to create socket tree: %v", err)
+		}
+		defer cleanup()
+
+		env = append(env, "TEST_UDS_TREE="+socketDir)
+		// On Linux, the concept of "attach" location doesn't exist.
+		// Just pass the same path to make these test identical.
+		env = append(env, "TEST_UDS_ATTACH_TREE="+socketDir)
+	}
+
+	cmd := exec.Command(testBin, tc.Args()...)
+	cmd.Env = env
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		ws := err.(*exec.ExitError).Sys().(syscall.WaitStatus)
+		t.Errorf("test %q exited with status %d, want 0", tc.FullName(), ws.ExitStatus())
+	}
+}
+
+// runRunsc runs spec in runsc in a standard test configuration.
+//
+// runsc logs will be saved to a path in TEST_UNDECLARED_OUTPUTS_DIR.
+//
+// Returns an error if the sandboxed application exits non-zero.
+func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
+	bundleDir, err := testutil.SetupBundleDir(spec)
+	if err != nil {
+		return fmt.Errorf("SetupBundleDir failed: %v", err)
+	}
+	defer os.RemoveAll(bundleDir)
+
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		return fmt.Errorf("SetupRootDir failed: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	name := tc.FullName()
+	id := testutil.UniqueContainerID()
+	log.Infof("Running test %q in container %q", name, id)
+	specutils.LogSpec(spec)
+
+	args := []string{
+		"-root", rootDir,
+		"-network", *network,
+		"-log-format=text",
+		"-TESTONLY-unsafe-nonroot=true",
+		"-net-raw=true",
+		fmt.Sprintf("-panic-signal=%d", syscall.SIGTERM),
+		"-watchdog-action=panic",
+		"-platform", *platform,
+		"-file-access", *fileAccess,
+	}
+	if *overlay {
+		args = append(args, "-overlay")
+	}
+	if *debug {
+		args = append(args, "-debug", "-log-packets=true")
+	}
+	if *strace {
+		args = append(args, "-strace")
+	}
+	if *addUDSTree {
+		args = append(args, "-fsgofer-host-uds")
+	}
+
+	if outDir, ok := syscall.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
+		tdir := filepath.Join(outDir, strings.Replace(name, "/", "_", -1))
+		if err := os.MkdirAll(tdir, 0755); err != nil {
+			return fmt.Errorf("could not create test dir: %v", err)
+		}
+		debugLogDir, err := ioutil.TempDir(tdir, "runsc")
+		if err != nil {
+			return fmt.Errorf("could not create temp dir: %v", err)
+		}
+		debugLogDir += "/"
+		log.Infof("runsc logs: %s", debugLogDir)
+		args = append(args, "-debug-log", debugLogDir)
+
+		// Default -log sends messages to stderr which makes reading the test log
+		// difficult. Instead, drop them when debug log is enabled given it's a
+		// better place for these messages.
+		args = append(args, "-log=/dev/null")
+	}
+
+	// Current process doesn't have CAP_SYS_ADMIN, create user namespace and run
+	// as root inside that namespace to get it.
+	rArgs := append(args, "run", "--bundle", bundleDir, id)
+	cmd := exec.Command(*runscPath, rArgs...)
+	cmd.SysProcAttr = &syscall.SysProcAttr{
+		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
+		// Set current user/group as root inside the namespace.
+		UidMappings: []syscall.SysProcIDMap{
+			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
+		},
+		GidMappings: []syscall.SysProcIDMap{
+			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
+		},
+		GidMappingsEnableSetgroups: false,
+		Credential: &syscall.Credential{
+			Uid: 0,
+			Gid: 0,
+		},
+	}
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	sig := make(chan os.Signal, 1)
+	signal.Notify(sig, syscall.SIGTERM)
+	go func() {
+		s, ok := <-sig
+		if !ok {
+			return
+		}
+		log.Warningf("%s: Got signal: %v", name, s)
+		done := make(chan bool)
+		dArgs := append([]string{}, args...)
+		dArgs = append(dArgs, "-alsologtostderr=true", "debug", "--stacks", id)
+		go func(dArgs []string) {
+			cmd := exec.Command(*runscPath, dArgs...)
+			cmd.Stdout = os.Stdout
+			cmd.Stderr = os.Stderr
+			cmd.Run()
+			done <- true
+		}(dArgs)
+
+		timeout := time.After(3 * time.Second)
+		select {
+		case <-timeout:
+			log.Infof("runsc debug --stacks is timeouted")
+		case <-done:
+		}
+
+		log.Warningf("Send SIGTERM to the sandbox process")
+		dArgs = append(args, "debug",
+			fmt.Sprintf("--signal=%d", syscall.SIGTERM),
+			id)
+		cmd := exec.Command(*runscPath, dArgs...)
+		cmd.Stdout = os.Stdout
+		cmd.Stderr = os.Stderr
+		cmd.Run()
+	}()
+
+	err = cmd.Run()
+
+	signal.Stop(sig)
+	close(sig)
+
+	return err
+}
+
+// setupUDSTree updates the spec to expose a UDS tree for gofer socket testing.
+func setupUDSTree(spec *specs.Spec) (cleanup func(), err error) {
+	socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
+	if err != nil {
+		return nil, fmt.Errorf("failed to create socket tree: %v", err)
+	}
+
+	// Standard access to entire tree.
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets",
+		Source:      socketDir,
+		Type:        "bind",
+	})
+
+	// Individial attach points for each socket to test mounts that attach
+	// directly to the sockets.
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/stream/echo",
+		Source:      filepath.Join(socketDir, "stream/echo"),
+		Type:        "bind",
+	})
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/stream/nonlistening",
+		Source:      filepath.Join(socketDir, "stream/nonlistening"),
+		Type:        "bind",
+	})
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/seqpacket/echo",
+		Source:      filepath.Join(socketDir, "seqpacket/echo"),
+		Type:        "bind",
+	})
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/seqpacket/nonlistening",
+		Source:      filepath.Join(socketDir, "seqpacket/nonlistening"),
+		Type:        "bind",
+	})
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: "/tmp/sockets-attach/dgram/null",
+		Source:      filepath.Join(socketDir, "dgram/null"),
+		Type:        "bind",
+	})
+
+	spec.Process.Env = append(spec.Process.Env, "TEST_UDS_TREE=/tmp/sockets")
+	spec.Process.Env = append(spec.Process.Env, "TEST_UDS_ATTACH_TREE=/tmp/sockets-attach")
+
+	return cleanup, nil
+}
+
+// runsTestCaseRunsc runs the test case in runsc.
+func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
+	// Run a new container with the test executable and filter for the
+	// given test suite and name.
+	spec := testutil.NewSpecWithArgs(append([]string{testBin}, tc.Args()...)...)
+
+	// Mark the root as writeable, as some tests attempt to
+	// write to the rootfs, and expect EACCES, not EROFS.
+	spec.Root.Readonly = false
+
+	// Test spec comes with pre-defined mounts that we don't want. Reset it.
+	spec.Mounts = nil
+	if *useTmpfs {
+		// Forces '/tmp' to be mounted as tmpfs, otherwise test that rely on
+		// features only available in gVisor's internal tmpfs may fail.
+		spec.Mounts = append(spec.Mounts, specs.Mount{
+			Destination: "/tmp",
+			Type:        "tmpfs",
+		})
+	} else {
+		// Use a gofer-backed directory as '/tmp'.
+		//
+		// Tests might be running in parallel, so make sure each has a
+		// unique test temp dir.
+		//
+		// Some tests (e.g., sticky) access this mount from other
+		// users, so make sure it is world-accessible.
+		tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
+		if err != nil {
+			t.Fatalf("could not create temp dir: %v", err)
+		}
+		defer os.RemoveAll(tmpDir)
+
+		if err := os.Chmod(tmpDir, 0777); err != nil {
+			t.Fatalf("could not chmod temp dir: %v", err)
+		}
+
+		spec.Mounts = append(spec.Mounts, specs.Mount{
+			Type:        "bind",
+			Destination: "/tmp",
+			Source:      tmpDir,
+		})
+	}
+
+	// Set environment variables that indicate we are
+	// running in gVisor with the given platform and network.
+	platformVar := "TEST_ON_GVISOR"
+	networkVar := "GVISOR_NETWORK"
+	env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network)
+
+	// Remove env variables that cause the gunit binary to write output
+	// files, since they will stomp on eachother, and on the output files
+	// from this go test.
+	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
+
+	// Remove shard env variables so that the gunit binary does not try to
+	// intepret them.
+	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
+
+	// Set TEST_TMPDIR to /tmp, as some of the syscall tests require it to
+	// be backed by tmpfs.
+	for i, kv := range env {
+		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
+			env[i] = "TEST_TMPDIR=/tmp"
+			break
+		}
+	}
+
+	spec.Process.Env = env
+
+	if *addUDSTree {
+		cleanup, err := setupUDSTree(spec)
+		if err != nil {
+			t.Fatalf("error creating UDS tree: %v", err)
+		}
+		defer cleanup()
+	}
+
+	if err := runRunsc(tc, spec); err != nil {
+		t.Errorf("test %q failed with error %v, want nil", tc.FullName(), err)
+	}
+}
+
+// filterEnv returns an environment with the blacklisted variables removed.
+func filterEnv(env, blacklist []string) []string {
+	var out []string
+	for _, kv := range env {
+		ok := true
+		for _, k := range blacklist {
+			if strings.HasPrefix(kv, k+"=") {
+				ok = false
+				break
+			}
+		}
+		if ok {
+			out = append(out, kv)
+		}
+	}
+	return out
+}
+
+func fatalf(s string, args ...interface{}) {
+	fmt.Fprintf(os.Stderr, s+"\n", args...)
+	os.Exit(1)
+}
+
+func matchString(a, b string) (bool, error) {
+	return a == b, nil
+}
+
+func main() {
+	flag.Parse()
+	if flag.NArg() != 1 {
+		fatalf("test must be provided")
+	}
+	testBin := flag.Args()[0] // Only argument.
+
+	log.SetLevel(log.Info)
+	if *debug {
+		log.SetLevel(log.Debug)
+	}
+
+	if *platform != "native" && *runscPath == "" {
+		if err := testutil.ConfigureExePath(); err != nil {
+			panic(err.Error())
+		}
+		*runscPath = specutils.ExePath
+	}
+
+	// Make sure stdout and stderr are opened with O_APPEND, otherwise logs
+	// from outside the sandbox can (and will) stomp on logs from inside
+	// the sandbox.
+	for _, f := range []*os.File{os.Stdout, os.Stderr} {
+		flags, err := unix.FcntlInt(f.Fd(), unix.F_GETFL, 0)
+		if err != nil {
+			fatalf("error getting file flags for %v: %v", f, err)
+		}
+		if flags&unix.O_APPEND == 0 {
+			flags |= unix.O_APPEND
+			if _, err := unix.FcntlInt(f.Fd(), unix.F_SETFL, flags); err != nil {
+				fatalf("error setting file flags for %v: %v", f, err)
+			}
+		}
+	}
+
+	// Get all test cases in each binary.
+	testCases, err := gtest.ParseTestCases(testBin, true)
+	if err != nil {
+		fatalf("ParseTestCases(%q) failed: %v", testBin, err)
+	}
+
+	// Get subset of tests corresponding to shard.
+	indices, err := testutil.TestIndicesForShard(len(testCases))
+	if err != nil {
+		fatalf("TestsForShard() failed: %v", err)
+	}
+
+	// Resolve the absolute path for the binary.
+	testBin, err = filepath.Abs(testBin)
+	if err != nil {
+		fatalf("Abs() failed: %v", err)
+	}
+
+	// Run the tests.
+	var tests []testing.InternalTest
+	for _, tci := range indices {
+		// Capture tc.
+		tc := testCases[tci]
+		tests = append(tests, testing.InternalTest{
+			Name: fmt.Sprintf("%s_%s", tc.Suite, tc.Name),
+			F: func(t *testing.T) {
+				if *parallel {
+					t.Parallel()
+				}
+				if *platform == "native" {
+					// Run the test case on host.
+					runTestCaseNative(testBin, tc, t)
+				} else {
+					// Run the test case in runsc.
+					runTestCaseRunsc(testBin, tc, t)
+				}
+			},
+		})
+	}
+
+	testing.Main(matchString, tests, nil, nil)
+}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 31d239c0e..d69ac8356 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -1,5 +1,4 @@
-load("//tools:defs.bzl", "go_binary")
-load("//test/syscalls:build_defs.bzl", "syscall_test")
+load("//test/runner:defs.bzl", "syscall_test")
 
 package(licenses = ["notice"])
 
@@ -726,21 +725,3 @@ syscall_test(test = "//test/syscalls/linux:proc_net_unix_test")
 syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test")
 
 syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
-
-go_binary(
-    name = "syscall_test_runner",
-    testonly = 1,
-    srcs = ["syscall_test_runner.go"],
-    data = [
-        "//runsc",
-    ],
-    deps = [
-        "//pkg/log",
-        "//runsc/specutils",
-        "//runsc/testutil",
-        "//test/syscalls/gtest",
-        "//test/uds",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
-        "@org_golang_x_sys//unix:go_default_library",
-    ],
-)
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
deleted file mode 100644
index cbab85ef7..000000000
--- a/test/syscalls/build_defs.bzl
+++ /dev/null
@@ -1,180 +0,0 @@
-"""Defines a rule for syscall test targets."""
-
-load("//tools:defs.bzl", "loopback")
-
-def syscall_test(
-        test,
-        shard_count = 5,
-        size = "small",
-        use_tmpfs = False,
-        add_overlay = False,
-        add_uds_tree = False,
-        add_hostinet = False,
-        tags = None):
-    """syscall_test is a macro that will create targets for all platforms.
-
-    Args:
-      test: the test target.
-      shard_count: shards for defined tests.
-      size: the defined test size.
-      use_tmpfs: use tmpfs in the defined tests.
-      add_overlay: add an overlay test.
-      add_uds_tree: add a UDS test.
-      add_hostinet: add a hostinet test.
-      tags: starting test tags.
-    """
-
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "native",
-        use_tmpfs = False,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
-
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "kvm",
-        use_tmpfs = use_tmpfs,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
-
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "ptrace",
-        use_tmpfs = use_tmpfs,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
-
-    if add_overlay:
-        _syscall_test(
-            test = test,
-            shard_count = shard_count,
-            size = size,
-            platform = "ptrace",
-            use_tmpfs = False,  # overlay is adding a writable tmpfs on top of root.
-            add_uds_tree = add_uds_tree,
-            tags = tags,
-            overlay = True,
-        )
-
-    if not use_tmpfs:
-        # Also test shared gofer access.
-        _syscall_test(
-            test = test,
-            shard_count = shard_count,
-            size = size,
-            platform = "ptrace",
-            use_tmpfs = use_tmpfs,
-            add_uds_tree = add_uds_tree,
-            tags = tags,
-            file_access = "shared",
-        )
-
-    if add_hostinet:
-        _syscall_test(
-            test = test,
-            shard_count = shard_count,
-            size = size,
-            platform = "ptrace",
-            use_tmpfs = use_tmpfs,
-            network = "host",
-            add_uds_tree = add_uds_tree,
-            tags = tags,
-        )
-
-def _syscall_test(
-        test,
-        shard_count,
-        size,
-        platform,
-        use_tmpfs,
-        tags,
-        network = "none",
-        file_access = "exclusive",
-        overlay = False,
-        add_uds_tree = False):
-    test_name = test.split(":")[1]
-
-    # Prepend "runsc" to non-native platform names.
-    full_platform = platform if platform == "native" else "runsc_" + platform
-
-    name = test_name + "_" + full_platform
-    if file_access == "shared":
-        name += "_shared"
-    if overlay:
-        name += "_overlay"
-    if network != "none":
-        name += "_" + network + "net"
-
-    if tags == None:
-        tags = []
-
-    # Add the full_platform and file access in a tag to make it easier to run
-    # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared.
-    tags += [full_platform, "file_" + file_access]
-
-    # Hash this target into one of 15 buckets. This can be used to
-    # randomly split targets between different workflows.
-    hash15 = hash(native.package_name() + name) % 15
-    tags.append("hash15:" + str(hash15))
-
-    # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until
-    # we figure out how to request ipv4 sockets on Guitar machines.
-    if network == "host":
-        tags.append("noguitar")
-
-    # Disable off-host networking.
-    tags.append("requires-net:loopback")
-
-    # Add tag to prevent the tests from running in a Bazel sandbox.
-    # TODO(b/120560048): Make the tests run without this tag.
-    tags.append("no-sandbox")
-
-    # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is
-    # more stable.
-    if platform == "kvm":
-        tags.append("manual")
-        tags.append("requires-kvm")
-
-        # TODO(b/112165693): Remove when tests pass reliably.
-        tags.append("notap")
-
-    args = [
-        # Arguments are passed directly to syscall_test_runner binary.
-        "--test-name=" + test_name,
-        "--platform=" + platform,
-        "--network=" + network,
-        "--use-tmpfs=" + str(use_tmpfs),
-        "--file-access=" + file_access,
-        "--overlay=" + str(overlay),
-        "--add-uds-tree=" + str(add_uds_tree),
-    ]
-
-    sh_test(
-        srcs = ["syscall_test_runner.sh"],
-        name = name,
-        data = [
-            ":syscall_test_runner",
-            loopback,
-            test,
-        ],
-        args = args,
-        size = size,
-        tags = tags,
-        shard_count = shard_count,
-    )
-
-def sh_test(**kwargs):
-    """Wraps the standard sh_test."""
-    native.sh_test(
-        **kwargs
-    )
diff --git a/test/syscalls/gtest/BUILD b/test/syscalls/gtest/BUILD
deleted file mode 100644
index de4b2727c..000000000
--- a/test/syscalls/gtest/BUILD
+++ /dev/null
@@ -1,9 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "gtest",
-    srcs = ["gtest.go"],
-    visibility = ["//:sandbox"],
-)
diff --git a/test/syscalls/gtest/gtest.go b/test/syscalls/gtest/gtest.go
deleted file mode 100644
index bdec8eb07..000000000
--- a/test/syscalls/gtest/gtest.go
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package gtest contains helpers for running google-test tests from Go.
-package gtest
-
-import (
-	"fmt"
-	"os/exec"
-	"strings"
-)
-
-var (
-	// ListTestFlag is the flag that will list tests in gtest binaries.
-	ListTestFlag = "--gtest_list_tests"
-
-	// FilterTestFlag is the flag that will filter tests in gtest binaries.
-	FilterTestFlag = "--gtest_filter"
-)
-
-// TestCase is a single gtest test case.
-type TestCase struct {
-	// Suite is the suite for this test.
-	Suite string
-
-	// Name is the name of this individual test.
-	Name string
-}
-
-// FullName returns the name of the test including the suite. It is suitable to
-// pass to "-gtest_filter".
-func (tc TestCase) FullName() string {
-	return fmt.Sprintf("%s.%s", tc.Suite, tc.Name)
-}
-
-// ParseTestCases calls a gtest test binary to list its test and returns a
-// slice with the name and suite of each test.
-func ParseTestCases(testBin string, extraArgs ...string) ([]TestCase, error) {
-	args := append([]string{ListTestFlag}, extraArgs...)
-	cmd := exec.Command(testBin, args...)
-	out, err := cmd.Output()
-	if err != nil {
-		exitErr, ok := err.(*exec.ExitError)
-		if !ok {
-			return nil, fmt.Errorf("could not enumerate gtest tests: %v", err)
-		}
-		return nil, fmt.Errorf("could not enumerate gtest tests: %v\nstderr:\n%s", err, exitErr.Stderr)
-	}
-
-	var t []TestCase
-	var suite string
-	for _, line := range strings.Split(string(out), "\n") {
-		// Strip comments.
-		line = strings.Split(line, "#")[0]
-
-		// New suite?
-		if !strings.HasPrefix(line, " ") {
-			suite = strings.TrimSuffix(strings.TrimSpace(line), ".")
-			continue
-		}
-
-		// Individual test.
-		name := strings.TrimSpace(line)
-
-		// Do we have a suite yet?
-		if suite == "" {
-			return nil, fmt.Errorf("test without a suite: %v", name)
-		}
-
-		// Add this individual test.
-		t = append(t, TestCase{
-			Suite: suite,
-			Name:  name,
-		})
-
-	}
-
-	if len(t) == 0 {
-		return nil, fmt.Errorf("no tests parsed from %v", testBin)
-	}
-	return t, nil
-}
diff --git a/test/syscalls/linux/alarm.cc b/test/syscalls/linux/alarm.cc
index d89269985..940c97285 100644
--- a/test/syscalls/linux/alarm.cc
+++ b/test/syscalls/linux/alarm.cc
@@ -188,6 +188,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index b5e0a512b..07bd527e6 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -868,6 +868,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 421c15b87..c7cc5816e 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -1128,5 +1128,5 @@ int main(int argc, char** argv) {
     exit(err);
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc
index b77e4cbd1..8b48f0804 100644
--- a/test/syscalls/linux/itimer.cc
+++ b/test/syscalls/linux/itimer.cc
@@ -349,6 +349,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/prctl.cc b/test/syscalls/linux/prctl.cc
index d07571a5f..04c5161f5 100644
--- a/test/syscalls/linux/prctl.cc
+++ b/test/syscalls/linux/prctl.cc
@@ -226,5 +226,5 @@ int main(int argc, char** argv) {
          prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/prctl_setuid.cc b/test/syscalls/linux/prctl_setuid.cc
index 30f0d75b3..c4e9cf528 100644
--- a/test/syscalls/linux/prctl_setuid.cc
+++ b/test/syscalls/linux/prctl_setuid.cc
@@ -264,5 +264,5 @@ int main(int argc, char** argv) {
            prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0);
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index a23fdb58d..f91187e75 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -2076,5 +2076,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index 4dd5cf27b..bfe3e2603 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -1208,5 +1208,5 @@ int main(int argc, char** argv) {
     gvisor::testing::RunExecveChild();
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/rtsignal.cc b/test/syscalls/linux/rtsignal.cc
index 81d193ffd..ed27e2566 100644
--- a/test/syscalls/linux/rtsignal.cc
+++ b/test/syscalls/linux/rtsignal.cc
@@ -167,6 +167,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 2c947feb7..cf6499f8b 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -411,5 +411,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/sigiret.cc b/test/syscalls/linux/sigiret.cc
index 4deb1ae95..6227774a4 100644
--- a/test/syscalls/linux/sigiret.cc
+++ b/test/syscalls/linux/sigiret.cc
@@ -132,6 +132,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/signalfd.cc b/test/syscalls/linux/signalfd.cc
index 95be4b66c..389e5fca2 100644
--- a/test/syscalls/linux/signalfd.cc
+++ b/test/syscalls/linux/signalfd.cc
@@ -369,5 +369,5 @@ int main(int argc, char** argv) {
 
   gvisor::testing::TestInit(&argc, &argv);
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/sigstop.cc b/test/syscalls/linux/sigstop.cc
index 7db57d968..b2fcedd62 100644
--- a/test/syscalls/linux/sigstop.cc
+++ b/test/syscalls/linux/sigstop.cc
@@ -147,5 +147,5 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/sigtimedwait.cc b/test/syscalls/linux/sigtimedwait.cc
index 1e5bf5942..4f8afff15 100644
--- a/test/syscalls/linux/sigtimedwait.cc
+++ b/test/syscalls/linux/sigtimedwait.cc
@@ -319,6 +319,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc
index 2f92c27da..4b3c44527 100644
--- a/test/syscalls/linux/timers.cc
+++ b/test/syscalls/linux/timers.cc
@@ -658,5 +658,5 @@ int main(int argc, char** argv) {
     }
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/vfork.cc b/test/syscalls/linux/vfork.cc
index 0aaba482d..19d05998e 100644
--- a/test/syscalls/linux/vfork.cc
+++ b/test/syscalls/linux/vfork.cc
@@ -191,5 +191,5 @@ int main(int argc, char** argv) {
     return gvisor::testing::RunChild();
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
deleted file mode 100644
index ae342b68c..000000000
--- a/test/syscalls/syscall_test_runner.go
+++ /dev/null
@@ -1,482 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Binary syscall_test_runner runs the syscall test suites in gVisor
-// containers and on the host platform.
-package main
-
-import (
-	"flag"
-	"fmt"
-	"io/ioutil"
-	"os"
-	"os/exec"
-	"os/signal"
-	"path/filepath"
-	"strings"
-	"syscall"
-	"testing"
-	"time"
-
-	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
-	"gvisor.dev/gvisor/test/syscalls/gtest"
-	"gvisor.dev/gvisor/test/uds"
-)
-
-// Location of syscall tests, relative to the repo root.
-const testDir = "test/syscalls/linux"
-
-var (
-	testName   = flag.String("test-name", "", "name of test binary to run")
-	debug      = flag.Bool("debug", false, "enable debug logs")
-	strace     = flag.Bool("strace", false, "enable strace logs")
-	platform   = flag.String("platform", "ptrace", "platform to run on")
-	network    = flag.String("network", "none", "network stack to run on (sandbox, host, none)")
-	useTmpfs   = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp")
-	fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode")
-	overlay    = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
-	parallel   = flag.Bool("parallel", false, "run tests in parallel")
-	runscPath  = flag.String("runsc", "", "path to runsc binary")
-
-	addUDSTree = flag.Bool("add-uds-tree", false, "expose a tree of UDS utilities for use in tests")
-)
-
-// runTestCaseNative runs the test case directly on the host machine.
-func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) {
-	// These tests might be running in parallel, so make sure they have a
-	// unique test temp dir.
-	tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
-	if err != nil {
-		t.Fatalf("could not create temp dir: %v", err)
-	}
-	defer os.RemoveAll(tmpDir)
-
-	// Replace TEST_TMPDIR in the current environment with something
-	// unique.
-	env := os.Environ()
-	newEnvVar := "TEST_TMPDIR=" + tmpDir
-	var found bool
-	for i, kv := range env {
-		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
-			env[i] = newEnvVar
-			found = true
-			break
-		}
-	}
-	if !found {
-		env = append(env, newEnvVar)
-	}
-	// Remove env variables that cause the gunit binary to write output
-	// files, since they will stomp on eachother, and on the output files
-	// from this go test.
-	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
-
-	// Remove shard env variables so that the gunit binary does not try to
-	// intepret them.
-	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
-
-	if *addUDSTree {
-		socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
-		if err != nil {
-			t.Fatalf("failed to create socket tree: %v", err)
-		}
-		defer cleanup()
-
-		env = append(env, "TEST_UDS_TREE="+socketDir)
-		// On Linux, the concept of "attach" location doesn't exist.
-		// Just pass the same path to make these test identical.
-		env = append(env, "TEST_UDS_ATTACH_TREE="+socketDir)
-	}
-
-	cmd := exec.Command(testBin, gtest.FilterTestFlag+"="+tc.FullName())
-	cmd.Env = env
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-	if err := cmd.Run(); err != nil {
-		ws := err.(*exec.ExitError).Sys().(syscall.WaitStatus)
-		t.Errorf("test %q exited with status %d, want 0", tc.FullName(), ws.ExitStatus())
-	}
-}
-
-// runRunsc runs spec in runsc in a standard test configuration.
-//
-// runsc logs will be saved to a path in TEST_UNDECLARED_OUTPUTS_DIR.
-//
-// Returns an error if the sandboxed application exits non-zero.
-func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
-	bundleDir, err := testutil.SetupBundleDir(spec)
-	if err != nil {
-		return fmt.Errorf("SetupBundleDir failed: %v", err)
-	}
-	defer os.RemoveAll(bundleDir)
-
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		return fmt.Errorf("SetupRootDir failed: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
-	name := tc.FullName()
-	id := testutil.UniqueContainerID()
-	log.Infof("Running test %q in container %q", name, id)
-	specutils.LogSpec(spec)
-
-	args := []string{
-		"-root", rootDir,
-		"-network", *network,
-		"-log-format=text",
-		"-TESTONLY-unsafe-nonroot=true",
-		"-net-raw=true",
-		fmt.Sprintf("-panic-signal=%d", syscall.SIGTERM),
-		"-watchdog-action=panic",
-		"-platform", *platform,
-		"-file-access", *fileAccess,
-	}
-	if *overlay {
-		args = append(args, "-overlay")
-	}
-	if *debug {
-		args = append(args, "-debug", "-log-packets=true")
-	}
-	if *strace {
-		args = append(args, "-strace")
-	}
-	if *addUDSTree {
-		args = append(args, "-fsgofer-host-uds")
-	}
-
-	if outDir, ok := syscall.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
-		tdir := filepath.Join(outDir, strings.Replace(name, "/", "_", -1))
-		if err := os.MkdirAll(tdir, 0755); err != nil {
-			return fmt.Errorf("could not create test dir: %v", err)
-		}
-		debugLogDir, err := ioutil.TempDir(tdir, "runsc")
-		if err != nil {
-			return fmt.Errorf("could not create temp dir: %v", err)
-		}
-		debugLogDir += "/"
-		log.Infof("runsc logs: %s", debugLogDir)
-		args = append(args, "-debug-log", debugLogDir)
-
-		// Default -log sends messages to stderr which makes reading the test log
-		// difficult. Instead, drop them when debug log is enabled given it's a
-		// better place for these messages.
-		args = append(args, "-log=/dev/null")
-	}
-
-	// Current process doesn't have CAP_SYS_ADMIN, create user namespace and run
-	// as root inside that namespace to get it.
-	rArgs := append(args, "run", "--bundle", bundleDir, id)
-	cmd := exec.Command(*runscPath, rArgs...)
-	cmd.SysProcAttr = &syscall.SysProcAttr{
-		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
-		// Set current user/group as root inside the namespace.
-		UidMappings: []syscall.SysProcIDMap{
-			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
-		},
-		GidMappings: []syscall.SysProcIDMap{
-			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
-		},
-		GidMappingsEnableSetgroups: false,
-		Credential: &syscall.Credential{
-			Uid: 0,
-			Gid: 0,
-		},
-	}
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-	sig := make(chan os.Signal, 1)
-	signal.Notify(sig, syscall.SIGTERM)
-	go func() {
-		s, ok := <-sig
-		if !ok {
-			return
-		}
-		log.Warningf("%s: Got signal: %v", name, s)
-		done := make(chan bool)
-		dArgs := append([]string{}, args...)
-		dArgs = append(dArgs, "-alsologtostderr=true", "debug", "--stacks", id)
-		go func(dArgs []string) {
-			cmd := exec.Command(*runscPath, dArgs...)
-			cmd.Stdout = os.Stdout
-			cmd.Stderr = os.Stderr
-			cmd.Run()
-			done <- true
-		}(dArgs)
-
-		timeout := time.After(3 * time.Second)
-		select {
-		case <-timeout:
-			log.Infof("runsc debug --stacks is timeouted")
-		case <-done:
-		}
-
-		log.Warningf("Send SIGTERM to the sandbox process")
-		dArgs = append(args, "debug",
-			fmt.Sprintf("--signal=%d", syscall.SIGTERM),
-			id)
-		cmd := exec.Command(*runscPath, dArgs...)
-		cmd.Stdout = os.Stdout
-		cmd.Stderr = os.Stderr
-		cmd.Run()
-	}()
-
-	err = cmd.Run()
-
-	signal.Stop(sig)
-	close(sig)
-
-	return err
-}
-
-// setupUDSTree updates the spec to expose a UDS tree for gofer socket testing.
-func setupUDSTree(spec *specs.Spec) (cleanup func(), err error) {
-	socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
-	if err != nil {
-		return nil, fmt.Errorf("failed to create socket tree: %v", err)
-	}
-
-	// Standard access to entire tree.
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets",
-		Source:      socketDir,
-		Type:        "bind",
-	})
-
-	// Individial attach points for each socket to test mounts that attach
-	// directly to the sockets.
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/stream/echo",
-		Source:      filepath.Join(socketDir, "stream/echo"),
-		Type:        "bind",
-	})
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/stream/nonlistening",
-		Source:      filepath.Join(socketDir, "stream/nonlistening"),
-		Type:        "bind",
-	})
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/seqpacket/echo",
-		Source:      filepath.Join(socketDir, "seqpacket/echo"),
-		Type:        "bind",
-	})
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/seqpacket/nonlistening",
-		Source:      filepath.Join(socketDir, "seqpacket/nonlistening"),
-		Type:        "bind",
-	})
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/dgram/null",
-		Source:      filepath.Join(socketDir, "dgram/null"),
-		Type:        "bind",
-	})
-
-	spec.Process.Env = append(spec.Process.Env, "TEST_UDS_TREE=/tmp/sockets")
-	spec.Process.Env = append(spec.Process.Env, "TEST_UDS_ATTACH_TREE=/tmp/sockets-attach")
-
-	return cleanup, nil
-}
-
-// runsTestCaseRunsc runs the test case in runsc.
-func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
-	// Run a new container with the test executable and filter for the
-	// given test suite and name.
-	spec := testutil.NewSpecWithArgs(testBin, gtest.FilterTestFlag+"="+tc.FullName())
-
-	// Mark the root as writeable, as some tests attempt to
-	// write to the rootfs, and expect EACCES, not EROFS.
-	spec.Root.Readonly = false
-
-	// Test spec comes with pre-defined mounts that we don't want. Reset it.
-	spec.Mounts = nil
-	if *useTmpfs {
-		// Forces '/tmp' to be mounted as tmpfs, otherwise test that rely on
-		// features only available in gVisor's internal tmpfs may fail.
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Destination: "/tmp",
-			Type:        "tmpfs",
-		})
-	} else {
-		// Use a gofer-backed directory as '/tmp'.
-		//
-		// Tests might be running in parallel, so make sure each has a
-		// unique test temp dir.
-		//
-		// Some tests (e.g., sticky) access this mount from other
-		// users, so make sure it is world-accessible.
-		tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
-		if err != nil {
-			t.Fatalf("could not create temp dir: %v", err)
-		}
-		defer os.RemoveAll(tmpDir)
-
-		if err := os.Chmod(tmpDir, 0777); err != nil {
-			t.Fatalf("could not chmod temp dir: %v", err)
-		}
-
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Type:        "bind",
-			Destination: "/tmp",
-			Source:      tmpDir,
-		})
-	}
-
-	// Set environment variables that indicate we are
-	// running in gVisor with the given platform and network.
-	platformVar := "TEST_ON_GVISOR"
-	networkVar := "GVISOR_NETWORK"
-	env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network)
-
-	// Remove env variables that cause the gunit binary to write output
-	// files, since they will stomp on eachother, and on the output files
-	// from this go test.
-	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
-
-	// Remove shard env variables so that the gunit binary does not try to
-	// intepret them.
-	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
-
-	// Set TEST_TMPDIR to /tmp, as some of the syscall tests require it to
-	// be backed by tmpfs.
-	for i, kv := range env {
-		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
-			env[i] = "TEST_TMPDIR=/tmp"
-			break
-		}
-	}
-
-	spec.Process.Env = env
-
-	if *addUDSTree {
-		cleanup, err := setupUDSTree(spec)
-		if err != nil {
-			t.Fatalf("error creating UDS tree: %v", err)
-		}
-		defer cleanup()
-	}
-
-	if err := runRunsc(tc, spec); err != nil {
-		t.Errorf("test %q failed with error %v, want nil", tc.FullName(), err)
-	}
-}
-
-// filterEnv returns an environment with the blacklisted variables removed.
-func filterEnv(env, blacklist []string) []string {
-	var out []string
-	for _, kv := range env {
-		ok := true
-		for _, k := range blacklist {
-			if strings.HasPrefix(kv, k+"=") {
-				ok = false
-				break
-			}
-		}
-		if ok {
-			out = append(out, kv)
-		}
-	}
-	return out
-}
-
-func fatalf(s string, args ...interface{}) {
-	fmt.Fprintf(os.Stderr, s+"\n", args...)
-	os.Exit(1)
-}
-
-func matchString(a, b string) (bool, error) {
-	return a == b, nil
-}
-
-func main() {
-	flag.Parse()
-	if *testName == "" {
-		fatalf("test-name flag must be provided")
-	}
-
-	log.SetLevel(log.Info)
-	if *debug {
-		log.SetLevel(log.Debug)
-	}
-
-	if *platform != "native" && *runscPath == "" {
-		if err := testutil.ConfigureExePath(); err != nil {
-			panic(err.Error())
-		}
-		*runscPath = specutils.ExePath
-	}
-
-	// Make sure stdout and stderr are opened with O_APPEND, otherwise logs
-	// from outside the sandbox can (and will) stomp on logs from inside
-	// the sandbox.
-	for _, f := range []*os.File{os.Stdout, os.Stderr} {
-		flags, err := unix.FcntlInt(f.Fd(), unix.F_GETFL, 0)
-		if err != nil {
-			fatalf("error getting file flags for %v: %v", f, err)
-		}
-		if flags&unix.O_APPEND == 0 {
-			flags |= unix.O_APPEND
-			if _, err := unix.FcntlInt(f.Fd(), unix.F_SETFL, flags); err != nil {
-				fatalf("error setting file flags for %v: %v", f, err)
-			}
-		}
-	}
-
-	// Get path to test binary.
-	fullTestName := filepath.Join(testDir, *testName)
-	testBin, err := testutil.FindFile(fullTestName)
-	if err != nil {
-		fatalf("FindFile(%q) failed: %v", fullTestName, err)
-	}
-
-	// Get all test cases in each binary.
-	testCases, err := gtest.ParseTestCases(testBin)
-	if err != nil {
-		fatalf("ParseTestCases(%q) failed: %v", testBin, err)
-	}
-
-	// Get subset of tests corresponding to shard.
-	indices, err := testutil.TestIndicesForShard(len(testCases))
-	if err != nil {
-		fatalf("TestsForShard() failed: %v", err)
-	}
-
-	// Run the tests.
-	var tests []testing.InternalTest
-	for _, tci := range indices {
-		// Capture tc.
-		tc := testCases[tci]
-		testName := fmt.Sprintf("%s_%s", tc.Suite, tc.Name)
-		tests = append(tests, testing.InternalTest{
-			Name: testName,
-			F: func(t *testing.T) {
-				if *parallel {
-					t.Parallel()
-				}
-				if *platform == "native" {
-					// Run the test case on host.
-					runTestCaseNative(testBin, tc, t)
-				} else {
-					// Run the test case in runsc.
-					runTestCaseRunsc(testBin, tc, t)
-				}
-			},
-		})
-	}
-
-	testing.Main(matchString, tests, nil, nil)
-}
diff --git a/test/syscalls/syscall_test_runner.sh b/test/syscalls/syscall_test_runner.sh
deleted file mode 100755
index 864bb2de4..000000000
--- a/test/syscalls/syscall_test_runner.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# syscall_test_runner.sh is a simple wrapper around the go syscall test runner.
-# It exists so that we can build the syscall test runner once, and use it for
-# all syscall tests, rather than build it for each test run.
-
-set -euf -x -o pipefail
-
-echo -- "$@"
-
-if [[ -n "${TEST_UNDECLARED_OUTPUTS_DIR}" ]]; then
-  mkdir -p "${TEST_UNDECLARED_OUTPUTS_DIR}"
-  chmod a+rwx "${TEST_UNDECLARED_OUTPUTS_DIR}"
-fi
-
-# Get location of syscall_test_runner binary.
-readonly runner=$(find "${TEST_SRCDIR}" -name syscall_test_runner)
-
-# Pass the arguments of this script directly to the runner.
-exec "${runner}" "$@"
diff --git a/test/util/BUILD b/test/util/BUILD
index 1f22ebe29..8b5a0f25c 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_library", "cc_test", "gtest", "select_system")
+load("//tools:defs.bzl", "cc_library", "cc_test", "gbenchmark", "gtest", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -260,6 +260,7 @@ cc_library(
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
         gtest,
+        gbenchmark,
     ],
 )
 
diff --git a/test/util/test_main.cc b/test/util/test_main.cc
index 5c7ee0064..1f389e58f 100644
--- a/test/util/test_main.cc
+++ b/test/util/test_main.cc
@@ -16,5 +16,5 @@
 
 int main(int argc, char** argv) {
   gvisor::testing::TestInit(&argc, &argv);
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/util/test_util.h b/test/util/test_util.h
index 2d22b0eb8..c5cb9d6d6 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -771,6 +771,7 @@ std::string RunfilePath(std::string path);
 #endif
 
 void TestInit(int* argc, char*** argv);
+int RunAllTests(void);
 
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/util/test_util_impl.cc b/test/util/test_util_impl.cc
index ba7c0a85b..7e1ad9e66 100644
--- a/test/util/test_util_impl.cc
+++ b/test/util/test_util_impl.cc
@@ -17,8 +17,12 @@
 #include "gtest/gtest.h"
 #include "absl/flags/flag.h"
 #include "absl/flags/parse.h"
+#include "benchmark/benchmark.h"
 #include "test/util/logging.h"
 
+extern bool FLAGS_benchmark_list_tests;
+extern std::string FLAGS_benchmark_filter;
+
 namespace gvisor {
 namespace testing {
 
@@ -26,6 +30,7 @@ void SetupGvisorDeathTest() {}
 
 void TestInit(int* argc, char*** argv) {
   ::testing::InitGoogleTest(argc, *argv);
+  benchmark::Initialize(argc, *argv);
   ::absl::ParseCommandLine(*argc, *argv);
 
   // Always mask SIGPIPE as it's common and tests aren't expected to handle it.
@@ -34,5 +39,14 @@ void TestInit(int* argc, char*** argv) {
   TEST_CHECK(sigaction(SIGPIPE, &sa, nullptr) == 0);
 }
 
+int RunAllTests() {
+  if (FLAGS_benchmark_list_tests || FLAGS_benchmark_filter != ".") {
+    benchmark::RunSpecifiedBenchmarks();
+    return 0;
+  } else {
+    return RUN_ALL_TESTS();
+  }
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index 6798362dc..6f091d759 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -21,6 +21,7 @@ go_image = _go_image
 go_embed_data = _go_embed_data
 go_suffixes = _go_suffixes
 gtest = "@com_google_googletest//:gtest"
+gbenchmark = "@com_google_benchmark//:benchmark"
 loopback = "//tools/bazeldefs:loopback"
 proto_library = native.proto_library
 pkg_deb = _pkg_deb
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 39f035f12..4eece2d83 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,7 +7,7 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/bazeldefs:defs.bzl", "go_suffixes", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/bazeldefs:defs.bzl", "go_suffixes", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
 
 # Delegate directly.
 cc_binary = _cc_binary
@@ -21,6 +21,7 @@ go_image = _go_image
 go_test = _go_test
 go_tool_library = _go_tool_library
 gtest = _gtest
+gbenchmark = _gbenchmark
 pkg_deb = _pkg_deb
 pkg_tar = _pkg_tar
 py_library = _py_library
-- 
cgit v1.2.3


From 4a73bae269ae9f52a962ae3b08a17ccaacf7ba80 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 15:19:40 -0800
Subject: Initial network namespace support.

TCP/IP will work with netstack networking. hostinet doesn't work, and sockets
will have the same behavior as it is now.

Before the userspace is able to create device, the default loopback device can
be used to test.

/proc/net and /sys/net will still be connected to the root network stack; this
is the same behavior now.

Issue #1833

PiperOrigin-RevId: 296309389
---
 pkg/sentry/fs/proc/net.go                |   5 +-
 pkg/sentry/fs/proc/sys_net.go            |   4 +-
 pkg/sentry/fsimpl/proc/tasks_net.go      |   5 +-
 pkg/sentry/fsimpl/proc/tasks_sys.go      |   4 +-
 pkg/sentry/fsimpl/testutil/kernel.go     |   1 +
 pkg/sentry/inet/BUILD                    |   1 +
 pkg/sentry/inet/namespace.go             |  99 +++++++++++++++++++++++++
 pkg/sentry/kernel/kernel.go              |  26 ++++---
 pkg/sentry/kernel/task.go                |   9 +--
 pkg/sentry/kernel/task_clone.go          |  16 ++--
 pkg/sentry/kernel/task_net.go            |  19 +++--
 pkg/sentry/kernel/task_start.go          |   8 +-
 pkg/tcpip/time_unsafe.go                 |   2 +
 runsc/boot/BUILD                         |   2 +-
 runsc/boot/controller.go                 |  11 +--
 runsc/boot/loader.go                     | 121 +++++++++++++++++++++----------
 runsc/boot/network.go                    |  27 +++++++
 runsc/boot/pprof.go                      |  18 -----
 runsc/boot/pprof/BUILD                   |  11 +++
 runsc/boot/pprof/pprof.go                |  20 +++++
 runsc/sandbox/network.go                 |  25 +------
 test/syscalls/BUILD                      |   2 +
 test/syscalls/linux/BUILD                |  17 +++++
 test/syscalls/linux/network_namespace.cc | 121 +++++++++++++++++++++++++++++++
 24 files changed, 451 insertions(+), 123 deletions(-)
 create mode 100644 pkg/sentry/inet/namespace.go
 delete mode 100644 runsc/boot/pprof.go
 create mode 100644 runsc/boot/pprof/BUILD
 create mode 100644 runsc/boot/pprof/pprof.go
 create mode 100644 test/syscalls/linux/network_namespace.cc

(limited to 'test/syscalls')

diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 6f2775344..95d5817ff 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -43,7 +43,10 @@ import (
 // newNet creates a new proc net entry.
 func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode {
 	var contents map[string]*fs.Inode
-	if s := p.k.NetworkStack(); s != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process. We should make this per-process,
+	// a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net.
+	if s := p.k.RootNetworkNamespace().Stack(); s != nil {
 		contents = map[string]*fs.Inode{
 			"dev":  seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc),
 			"snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc),
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index 0772d4ae4..d4c4b533d 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -357,7 +357,9 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine
 
 func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 	var contents map[string]*fs.Inode
-	if s := p.k.NetworkStack(); s != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process.
+	if s := p.k.RootNetworkNamespace().Stack(); s != nil {
 		contents = map[string]*fs.Inode{
 			"ipv4": p.newSysNetIPv4Dir(ctx, msrc, s),
 			"core": p.newSysNetCore(ctx, msrc, s),
diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go
index 608fec017..d4e1812d8 100644
--- a/pkg/sentry/fsimpl/proc/tasks_net.go
+++ b/pkg/sentry/fsimpl/proc/tasks_net.go
@@ -39,7 +39,10 @@ import (
 
 func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
 	var contents map[string]*kernfs.Dentry
-	if stack := k.NetworkStack(); stack != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process. We should make this per-process,
+	// a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net.
+	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
 		const (
 			arp       = "IP address       HW type     Flags       HW address            Mask     Device\n"
 			netlink   = "sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n"
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index c7ce74883..3d5dc463c 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -50,7 +50,9 @@ func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k
 func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
 	var contents map[string]*kernfs.Dentry
 
-	if stack := k.NetworkStack(); stack != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process.
+	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
 		contents = map[string]*kernfs.Dentry{
 			"ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
 				"tcp_sack": newDentry(root, inoGen.NextIno(), 0644, &tcpSackData{stack: stack}),
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index d0be32e72..488478e29 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -128,6 +128,7 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns
 		ThreadGroup:             tc,
 		TaskContext:             &kernel.TaskContext{Name: name},
 		Credentials:             auth.CredentialsFromContext(ctx),
+		NetworkNamespace:        k.RootNetworkNamespace(),
 		AllowedCPUMask:          sched.NewFullCPUSet(k.ApplicationCores()),
 		UTSNamespace:            kernel.UTSNamespaceFromContext(ctx),
 		IPCNamespace:            kernel.IPCNamespaceFromContext(ctx),
diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index 334432abf..07bf39fed 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -10,6 +10,7 @@ go_library(
     srcs = [
         "context.go",
         "inet.go",
+        "namespace.go",
         "test_stack.go",
     ],
     deps = [
diff --git a/pkg/sentry/inet/namespace.go b/pkg/sentry/inet/namespace.go
new file mode 100644
index 000000000..c16667e7f
--- /dev/null
+++ b/pkg/sentry/inet/namespace.go
@@ -0,0 +1,99 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package inet
+
+// Namespace represents a network namespace. See network_namespaces(7).
+//
+// +stateify savable
+type Namespace struct {
+	// stack is the network stack implementation of this network namespace.
+	stack Stack `state:"nosave"`
+
+	// creator allows kernel to create new network stack for network namespaces.
+	// If nil, no networking will function if network is namespaced.
+	creator NetworkStackCreator
+
+	// isRoot indicates whether this is the root network namespace.
+	isRoot bool
+}
+
+// NewRootNamespace creates the root network namespace, with creator
+// allowing new network namespaces to be created. If creator is nil, no
+// networking will function if the network is namespaced.
+func NewRootNamespace(stack Stack, creator NetworkStackCreator) *Namespace {
+	return &Namespace{
+		stack:   stack,
+		creator: creator,
+		isRoot:  true,
+	}
+}
+
+// NewNamespace creates a new network namespace from the root.
+func NewNamespace(root *Namespace) *Namespace {
+	n := &Namespace{
+		creator: root.creator,
+	}
+	n.init()
+	return n
+}
+
+// Stack returns the network stack of n. Stack may return nil if no network
+// stack is configured.
+func (n *Namespace) Stack() Stack {
+	return n.stack
+}
+
+// IsRoot returns whether n is the root network namespace.
+func (n *Namespace) IsRoot() bool {
+	return n.isRoot
+}
+
+// RestoreRootStack restores the root network namespace with stack. This should
+// only be called when restoring kernel.
+func (n *Namespace) RestoreRootStack(stack Stack) {
+	if !n.isRoot {
+		panic("RestoreRootStack can only be called on root network namespace")
+	}
+	if n.stack != nil {
+		panic("RestoreRootStack called after a stack has already been set")
+	}
+	n.stack = stack
+}
+
+func (n *Namespace) init() {
+	// Root network namespace will have stack assigned later.
+	if n.isRoot {
+		return
+	}
+	if n.creator != nil {
+		var err error
+		n.stack, err = n.creator.CreateStack()
+		if err != nil {
+			panic(err)
+		}
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (n *Namespace) afterLoad() {
+	n.init()
+}
+
+// NetworkStackCreator allows new instances of a network stack to be created. It
+// is used by the kernel to create new network namespaces when requested.
+type NetworkStackCreator interface {
+	// CreateStack creates a new network stack for a network namespace.
+	CreateStack() (Stack, error)
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 7da0368f1..c62fd6eb1 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -111,7 +111,7 @@ type Kernel struct {
 	timekeeper                  *Timekeeper
 	tasks                       *TaskSet
 	rootUserNamespace           *auth.UserNamespace
-	networkStack                inet.Stack `state:"nosave"`
+	rootNetworkNamespace        *inet.Namespace
 	applicationCores            uint
 	useHostCores                bool
 	extraAuxv                   []arch.AuxEntry
@@ -260,8 +260,9 @@ type InitKernelArgs struct {
 	// RootUserNamespace is the root user namespace.
 	RootUserNamespace *auth.UserNamespace
 
-	// NetworkStack is the TCP/IP network stack. NetworkStack may be nil.
-	NetworkStack inet.Stack
+	// RootNetworkNamespace is the root network namespace. If nil, no networking
+	// will be available.
+	RootNetworkNamespace *inet.Namespace
 
 	// ApplicationCores is the number of logical CPUs visible to sandboxed
 	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
@@ -320,7 +321,10 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	k.rootUTSNamespace = args.RootUTSNamespace
 	k.rootIPCNamespace = args.RootIPCNamespace
 	k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
-	k.networkStack = args.NetworkStack
+	k.rootNetworkNamespace = args.RootNetworkNamespace
+	if k.rootNetworkNamespace == nil {
+		k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil)
+	}
 	k.applicationCores = args.ApplicationCores
 	if args.UseHostCores {
 		k.useHostCores = true
@@ -543,8 +547,6 @@ func (ts *TaskSet) unregisterEpollWaiters() {
 func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
 	loadStart := time.Now()
 
-	k.networkStack = net
-
 	initAppCores := k.applicationCores
 
 	// Load the pre-saved CPUID FeatureSet.
@@ -575,6 +577,10 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
 	log.Infof("Kernel load stats: %s", &stats)
 	log.Infof("Kernel load took [%s].", time.Since(kernelStart))
 
+	// rootNetworkNamespace should be populated after loading the state file.
+	// Restore the root network stack.
+	k.rootNetworkNamespace.RestoreRootStack(net)
+
 	// Load the memory file's state.
 	memoryStart := time.Now()
 	if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil {
@@ -905,6 +911,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		FSContext:               fsContext,
 		FDTable:                 args.FDTable,
 		Credentials:             args.Credentials,
+		NetworkNamespace:        k.RootNetworkNamespace(),
 		AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
 		UTSNamespace:            args.UTSNamespace,
 		IPCNamespace:            args.IPCNamespace,
@@ -1255,10 +1262,9 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
 	return k.rootAbstractSocketNamespace
 }
 
-// NetworkStack returns the network stack. NetworkStack may return nil if no
-// network stack is available.
-func (k *Kernel) NetworkStack() inet.Stack {
-	return k.networkStack
+// RootNetworkNamespace returns the root network namespace, always non-nil.
+func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
+	return k.rootNetworkNamespace
 }
 
 // GlobalInit returns the thread group with ID 1 in the root PID namespace, or
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index a3443ff21..e37e23231 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -486,13 +486,10 @@ type Task struct {
 	numaPolicy   int32
 	numaNodeMask uint64
 
-	// If netns is true, the task is in a non-root network namespace. Network
-	// namespaces aren't currently implemented in full; being in a network
-	// namespace simply prevents the task from observing any network devices
-	// (including loopback) or using abstract socket addresses (see unix(7)).
+	// netns is the task's network namespace. netns is never nil.
 	//
-	// netns is protected by mu. netns is owned by the task goroutine.
-	netns bool
+	// netns is protected by mu.
+	netns *inet.Namespace
 
 	// If rseqPreempted is true, before the next call to p.Switch(),
 	// interrupt rseq critical regions as defined by rseqAddr and
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index ba74b4c1c..78866f280 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bpf"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -54,8 +55,7 @@ type SharingOptions struct {
 	NewUserNamespace bool
 
 	// If NewNetworkNamespace is true, the task should have an independent
-	// network namespace. (Note that network namespaces are not really
-	// implemented; see comment on Task.netns for details.)
+	// network namespace.
 	NewNetworkNamespace bool
 
 	// If NewFiles is true, the task should use an independent file descriptor
@@ -199,6 +199,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		ipcns = NewIPCNamespace(userns)
 	}
 
+	netns := t.NetworkNamespace()
+	if opts.NewNetworkNamespace {
+		netns = inet.NewNamespace(netns)
+	}
+
 	// TODO(b/63601033): Implement CLONE_NEWNS.
 	mntnsVFS2 := t.mountNamespaceVFS2
 	if mntnsVFS2 != nil {
@@ -268,7 +273,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		FDTable:                 fdTable,
 		Credentials:             creds,
 		Niceness:                t.Niceness(),
-		NetworkNamespaced:       t.netns,
+		NetworkNamespace:        netns,
 		AllowedCPUMask:          t.CPUMask(),
 		UTSNamespace:            utsns,
 		IPCNamespace:            ipcns,
@@ -283,9 +288,6 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	} else {
 		cfg.InheritParent = t
 	}
-	if opts.NewNetworkNamespace {
-		cfg.NetworkNamespaced = true
-	}
 	nt, err := t.tg.pidns.owner.NewTask(cfg)
 	if err != nil {
 		if opts.NewThreadGroup {
@@ -482,7 +484,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 			t.mu.Unlock()
 			return syserror.EPERM
 		}
-		t.netns = true
+		t.netns = inet.NewNamespace(t.netns)
 	}
 	if opts.NewUTSNamespace {
 		if !haveCapSysAdmin {
diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go
index 172a31e1d..f7711232c 100644
--- a/pkg/sentry/kernel/task_net.go
+++ b/pkg/sentry/kernel/task_net.go
@@ -22,14 +22,23 @@ import (
 func (t *Task) IsNetworkNamespaced() bool {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	return t.netns
+	return !t.netns.IsRoot()
 }
 
 // NetworkContext returns the network stack used by the task. NetworkContext
 // may return nil if no network stack is available.
+//
+// TODO(gvisor.dev/issue/1833): Migrate callers of this method to
+// NetworkNamespace().
 func (t *Task) NetworkContext() inet.Stack {
-	if t.IsNetworkNamespaced() {
-		return nil
-	}
-	return t.k.networkStack
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.netns.Stack()
+}
+
+// NetworkNamespace returns the network namespace observed by the task.
+func (t *Task) NetworkNamespace() *inet.Namespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.netns
 }
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index f9236a842..a5035bb7f 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
@@ -65,9 +66,8 @@ type TaskConfig struct {
 	// Niceness is the niceness of the new task.
 	Niceness int
 
-	// If NetworkNamespaced is true, the new task should observe a non-root
-	// network namespace.
-	NetworkNamespaced bool
+	// NetworkNamespace is the network namespace to be used for the new task.
+	NetworkNamespace *inet.Namespace
 
 	// AllowedCPUMask contains the cpus that this task can run on.
 	AllowedCPUMask sched.CPUSet
@@ -133,7 +133,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		allowedCPUMask:     cfg.AllowedCPUMask.Copy(),
 		ioUsage:            &usage.IO{},
 		niceness:           cfg.Niceness,
-		netns:              cfg.NetworkNamespaced,
+		netns:              cfg.NetworkNamespace,
 		utsns:              cfg.UTSNamespace,
 		ipcns:              cfg.IPCNamespace,
 		abstractSockets:    cfg.AbstractSocketNamespace,
diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go
index 48764b978..2f98a996f 100644
--- a/pkg/tcpip/time_unsafe.go
+++ b/pkg/tcpip/time_unsafe.go
@@ -25,6 +25,8 @@ import (
 )
 
 // StdClock implements Clock with the time package.
+//
+// +stateify savable
 type StdClock struct{}
 
 var _ Clock = (*StdClock)(nil)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index ae4dd102a..26f68fe3d 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -19,7 +19,6 @@ go_library(
         "loader_amd64.go",
         "loader_arm64.go",
         "network.go",
-        "pprof.go",
         "strace.go",
         "user.go",
     ],
@@ -91,6 +90,7 @@ go_library(
         "//pkg/usermem",
         "//runsc/boot/filter",
         "//runsc/boot/platforms",
+        "//runsc/boot/pprof",
         "//runsc/specutils",
         "@com_github_golang_protobuf//proto:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 9c9e94864..17e774e0c 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -32,6 +32,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/boot/pprof"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -142,7 +143,7 @@ func newController(fd int, l *Loader) (*controller, error) {
 	}
 	srv.Register(manager)
 
-	if eps, ok := l.k.NetworkStack().(*netstack.Stack); ok {
+	if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
 		net := &Network{
 			Stack: eps.Stack,
 		}
@@ -341,7 +342,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 		return fmt.Errorf("creating memory file: %v", err)
 	}
 	k.SetMemoryFile(mf)
-	networkStack := cm.l.k.NetworkStack()
+	networkStack := cm.l.k.RootNetworkNamespace().Stack()
 	cm.l.k = k
 
 	// Set up the restore environment.
@@ -365,9 +366,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	}
 
 	if cm.l.conf.ProfileEnable {
-		// initializePProf opens /proc/self/maps, so has to be
-		// called before installing seccomp filters.
-		initializePProf()
+		// pprof.Initialize opens /proc/self/maps, so has to be called before
+		// installing seccomp filters.
+		pprof.Initialize()
 	}
 
 	// Seccomp filters have to be applied before parsing the state file.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index eef43b9df..e7ca98134 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -49,6 +49,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -60,6 +61,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/runsc/boot/filter"
 	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
+	"gvisor.dev/gvisor/runsc/boot/pprof"
 	"gvisor.dev/gvisor/runsc/specutils"
 
 	// Include supported socket providers.
@@ -230,11 +232,8 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("enabling strace: %v", err)
 	}
 
-	// Create an empty network stack because the network namespace may be empty at
-	// this point. Netns is configured before Run() is called. Netstack is
-	// configured using a control uRPC message. Host network is configured inside
-	// Run().
-	networkStack, err := newEmptyNetworkStack(args.Conf, k, k)
+	// Create root network namespace/stack.
+	netns, err := newRootNetworkNamespace(args.Conf, k, k)
 	if err != nil {
 		return nil, fmt.Errorf("creating network: %v", err)
 	}
@@ -277,7 +276,7 @@ func New(args Args) (*Loader, error) {
 		FeatureSet:                  cpuid.HostFeatureSet(),
 		Timekeeper:                  tk,
 		RootUserNamespace:           creds.UserNamespace,
-		NetworkStack:                networkStack,
+		RootNetworkNamespace:        netns,
 		ApplicationCores:            uint(args.NumCPU),
 		Vdso:                        vdso,
 		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
@@ -466,7 +465,7 @@ func (l *Loader) run() error {
 		// Delay host network configuration to this point because network namespace
 		// is configured after the loader is created and before Run() is called.
 		log.Debugf("Configuring host network")
-		stack := l.k.NetworkStack().(*hostinet.Stack)
+		stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
 		if err := stack.Configure(); err != nil {
 			return err
 		}
@@ -485,7 +484,7 @@ func (l *Loader) run() error {
 	// l.restore is set by the container manager when a restore call is made.
 	if !l.restore {
 		if l.conf.ProfileEnable {
-			initializePProf()
+			pprof.Initialize()
 		}
 
 		// Finally done with all configuration. Setup filters before user code
@@ -908,48 +907,92 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newEmptyNetworkStack(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+	// Create an empty network stack because the network namespace may be empty at
+	// this point. Netns is configured before Run() is called. Netstack is
+	// configured using a control uRPC message. Host network is configured inside
+	// Run().
 	switch conf.Network {
 	case NetworkHost:
-		return hostinet.NewStack(), nil
+		// No network namespacing support for hostinet yet, hence creator is nil.
+		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
 
 	case NetworkNone, NetworkSandbox:
-		// NetworkNone sets up loopback using netstack.
-		netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
-		transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
-		s := netstack.Stack{stack.New(stack.Options{
-			NetworkProtocols:   netProtos,
-			TransportProtocols: transProtos,
-			Clock:              clock,
-			Stats:              netstack.Metrics,
-			HandleLocal:        true,
-			// Enable raw sockets for users with sufficient
-			// privileges.
-			RawFactory: raw.EndpointFactory{},
-			UniqueID:   uniqueID,
-		})}
-
-		// Enable SACK Recovery.
-		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
-			return nil, fmt.Errorf("failed to enable SACK: %v", err)
+		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
+		if err != nil {
+			return nil, err
 		}
+		creator := &sandboxNetstackCreator{
+			clock:    clock,
+			uniqueID: uniqueID,
+		}
+		return inet.NewRootNamespace(s, creator), nil
 
-		// Set default TTLs as required by socket/netstack.
-		s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
-		s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	default:
+		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	}
 
-		// Enable Receive Buffer Auto-Tuning.
-		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-			return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
-		}
+}
 
-		s.FillDefaultIPTables()
+func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
+	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+	s := netstack.Stack{stack.New(stack.Options{
+		NetworkProtocols:   netProtos,
+		TransportProtocols: transProtos,
+		Clock:              clock,
+		Stats:              netstack.Metrics,
+		HandleLocal:        true,
+		// Enable raw sockets for users with sufficient
+		// privileges.
+		RawFactory: raw.EndpointFactory{},
+		UniqueID:   uniqueID,
+	})}
 
-		return &s, nil
+	// Enable SACK Recovery.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
+		return nil, fmt.Errorf("failed to enable SACK: %v", err)
+	}
 
-	default:
-		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	// Set default TTLs as required by socket/netstack.
+	s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+
+	// Enable Receive Buffer Auto-Tuning.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
+		return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
+	}
+
+	s.FillDefaultIPTables()
+
+	return &s, nil
+}
+
+// sandboxNetstackCreator implements kernel.NetworkStackCreator.
+//
+// +stateify savable
+type sandboxNetstackCreator struct {
+	clock    tcpip.Clock
+	uniqueID stack.UniqueID
+}
+
+// CreateStack implements kernel.NetworkStackCreator.CreateStack.
+func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
+	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
+	if err != nil {
+		return nil, err
 	}
+
+	// Setup loopback.
+	n := &Network{Stack: s.(*netstack.Stack).Stack}
+	nicID := tcpip.NICID(f.uniqueID.UniqueID())
+	link := DefaultLoopbackLink
+	linkEP := loopback.New()
+	if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+		return nil, err
+	}
+
+	return s, nil
 }
 
 // signal sends a signal to one or more processes in a container. If PID is 0,
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 6a8765ec8..bee6ee336 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -17,6 +17,7 @@ package boot
 import (
 	"fmt"
 	"net"
+	"strings"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/log"
@@ -31,6 +32,32 @@ import (
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
+var (
+	// DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and
+	// "::1/8" on "lo" interface.
+	DefaultLoopbackLink = LoopbackLink{
+		Name: "lo",
+		Addresses: []net.IP{
+			net.IP("\x7f\x00\x00\x01"),
+			net.IPv6loopback,
+		},
+		Routes: []Route{
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv4(0x7f, 0, 0, 0),
+					Mask: net.IPv4Mask(0xff, 0, 0, 0),
+				},
+			},
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv6loopback,
+					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
+				},
+			},
+		},
+	}
+)
+
 // Network exposes methods that can be used to configure a network stack.
 type Network struct {
 	Stack *stack.Stack
diff --git a/runsc/boot/pprof.go b/runsc/boot/pprof.go
deleted file mode 100644
index 463362f02..000000000
--- a/runsc/boot/pprof.go
+++ /dev/null
@@ -1,18 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-func initializePProf() {
-}
diff --git a/runsc/boot/pprof/BUILD b/runsc/boot/pprof/BUILD
new file mode 100644
index 000000000..29cb42b2f
--- /dev/null
+++ b/runsc/boot/pprof/BUILD
@@ -0,0 +1,11 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "pprof",
+    srcs = ["pprof.go"],
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+)
diff --git a/runsc/boot/pprof/pprof.go b/runsc/boot/pprof/pprof.go
new file mode 100644
index 000000000..1ded20dee
--- /dev/null
+++ b/runsc/boot/pprof/pprof.go
@@ -0,0 +1,20 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pprof provides a stub to initialize custom profilers.
+package pprof
+
+// Initialize will be called at boot for initializing custom profilers.
+func Initialize() {
+}
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 99e143696..bc093fba5 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -21,7 +21,6 @@ import (
 	"path/filepath"
 	"runtime"
 	"strconv"
-	"strings"
 	"syscall"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
@@ -75,30 +74,8 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
 }
 
 func createDefaultLoopbackInterface(conn *urpc.Client) error {
-	link := boot.LoopbackLink{
-		Name: "lo",
-		Addresses: []net.IP{
-			net.IP("\x7f\x00\x00\x01"),
-			net.IPv6loopback,
-		},
-		Routes: []boot.Route{
-			{
-				Destination: net.IPNet{
-
-					IP:   net.IPv4(0x7f, 0, 0, 0),
-					Mask: net.IPv4Mask(0xff, 0, 0, 0),
-				},
-			},
-			{
-				Destination: net.IPNet{
-					IP:   net.IPv6loopback,
-					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
-				},
-			},
-		},
-	}
 	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{
-		LoopbackLinks: []boot.LoopbackLink{link},
+		LoopbackLinks: []boot.LoopbackLink{boot.DefaultLoopbackLink},
 	}, nil); err != nil {
 		return fmt.Errorf("creating loopback link and routes: %v", err)
 	}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index d69ac8356..d1977d4de 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -258,6 +258,8 @@ syscall_test(
 
 syscall_test(test = "//test/syscalls/linux:munmap_test")
 
+syscall_test(test = "//test/syscalls/linux:network_namespace_test")
+
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:open_create_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 05a818795..aa303af84 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3639,6 +3639,23 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "network_namespace_test",
+    testonly = 1,
+    srcs = ["network_namespace.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        gtest,
+        "//test/util:capability_util",
+        "//test/util:memory_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
 cc_binary(
     name = "semaphore_test",
     testonly = 1,
diff --git a/test/syscalls/linux/network_namespace.cc b/test/syscalls/linux/network_namespace.cc
new file mode 100644
index 000000000..6ea48c263
--- /dev/null
+++ b/test/syscalls/linux/network_namespace.cc
@@ -0,0 +1,121 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <net/if.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/synchronization/notification.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/memory_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using TestFunc = std::function<PosixError()>;
+using RunFunc = std::function<PosixError(TestFunc)>;
+
+struct NamespaceStrategy {
+  RunFunc run;
+
+  static NamespaceStrategy Of(RunFunc run) {
+    NamespaceStrategy s;
+    s.run = run;
+    return s;
+  }
+};
+
+PosixError RunWithUnshare(TestFunc fn) {
+  PosixError err = PosixError(-1, "function did not return a value");
+  ScopedThread t([&] {
+    if (unshare(CLONE_NEWNET) != 0) {
+      err = PosixError(errno);
+      return;
+    }
+    err = fn();
+  });
+  t.Join();
+  return err;
+}
+
+PosixError RunWithClone(TestFunc fn) {
+  struct Args {
+    absl::Notification n;
+    TestFunc fn;
+    PosixError err;
+  };
+  Args args;
+  args.fn = fn;
+  args.err = PosixError(-1, "function did not return a value");
+
+  ASSIGN_OR_RETURN_ERRNO(
+      Mapping child_stack,
+      MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+  pid_t child = clone(
+      +[](void *arg) {
+        Args *args = reinterpret_cast<Args *>(arg);
+        args->err = args->fn();
+        args->n.Notify();
+        syscall(SYS_exit, 0);  // Exit manually. No return address on stack.
+        return 0;
+      },
+      reinterpret_cast<void *>(child_stack.addr() + kPageSize),
+      CLONE_NEWNET | CLONE_THREAD | CLONE_SIGHAND | CLONE_VM, &args);
+  if (child < 0) {
+    return PosixError(errno, "clone() failed");
+  }
+  args.n.WaitForNotification();
+  return args.err;
+}
+
+class NetworkNamespaceTest
+    : public ::testing::TestWithParam<NamespaceStrategy> {};
+
+TEST_P(NetworkNamespaceTest, LoopbackExists) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  EXPECT_NO_ERRNO(GetParam().run([]() {
+    // TODO(gvisor.dev/issue/1833): Update this to test that only "lo" exists.
+    // Check loopback device exists.
+    int sock = socket(AF_INET, SOCK_DGRAM, 0);
+    if (sock < 0) {
+      return PosixError(errno, "socket() failed");
+    }
+    struct ifreq ifr;
+    snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
+    if (ioctl(sock, SIOCGIFINDEX, &ifr) < 0) {
+      return PosixError(errno, "ioctl() failed, lo cannot be found");
+    }
+    return NoError();
+  }));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllNetworkNamespaceTest, NetworkNamespaceTest,
+    ::testing::Values(NamespaceStrategy::Of(RunWithUnshare),
+                      NamespaceStrategy::Of(RunWithClone)));
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From b8f56c79be40d9c75f4e2f279c9d821d1c1c3569 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 21 Feb 2020 15:41:56 -0800
Subject: Implement tap/tun device in vfs.

PiperOrigin-RevId: 296526279
---
 pkg/abi/linux/BUILD                              |   1 +
 pkg/abi/linux/ioctl.go                           |  26 ++
 pkg/abi/linux/ioctl_tun.go                       |  29 ++
 pkg/sentry/fs/dev/BUILD                          |   5 +
 pkg/sentry/fs/dev/dev.go                         |  10 +-
 pkg/sentry/fs/dev/net_tun.go                     | 170 +++++++++++
 pkg/syserror/syserror.go                         |   1 +
 pkg/tcpip/buffer/view.go                         |   6 +
 pkg/tcpip/link/channel/BUILD                     |   1 +
 pkg/tcpip/link/channel/channel.go                | 180 +++++++++---
 pkg/tcpip/link/tun/BUILD                         |  18 +-
 pkg/tcpip/link/tun/device.go                     | 352 +++++++++++++++++++++++
 pkg/tcpip/link/tun/protocol.go                   |  56 ++++
 pkg/tcpip/stack/nic.go                           |  32 +++
 pkg/tcpip/stack/stack.go                         |  39 +++
 test/syscalls/BUILD                              |   2 +
 test/syscalls/linux/BUILD                        |  30 ++
 test/syscalls/linux/dev.cc                       |   7 +
 test/syscalls/linux/socket_netlink_route_util.cc | 163 +++++++++++
 test/syscalls/linux/socket_netlink_route_util.h  |  55 ++++
 test/syscalls/linux/tuntap.cc                    | 346 ++++++++++++++++++++++
 21 files changed, 1490 insertions(+), 39 deletions(-)
 create mode 100644 pkg/abi/linux/ioctl_tun.go
 create mode 100644 pkg/sentry/fs/dev/net_tun.go
 create mode 100644 pkg/tcpip/link/tun/device.go
 create mode 100644 pkg/tcpip/link/tun/protocol.go
 create mode 100644 test/syscalls/linux/socket_netlink_route_util.cc
 create mode 100644 test/syscalls/linux/socket_netlink_route_util.h
 create mode 100644 test/syscalls/linux/tuntap.cc

(limited to 'test/syscalls')

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index a89f34d4b..322d1ccc4 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -30,6 +30,7 @@ go_library(
         "futex.go",
         "inotify.go",
         "ioctl.go",
+        "ioctl_tun.go",
         "ip.go",
         "ipc.go",
         "limits.go",
diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go
index 0e18db9ef..2062e6a4b 100644
--- a/pkg/abi/linux/ioctl.go
+++ b/pkg/abi/linux/ioctl.go
@@ -72,3 +72,29 @@ const (
 	SIOCGMIIPHY   = 0x8947
 	SIOCGMIIREG   = 0x8948
 )
+
+// ioctl(2) directions. Used to calculate requests number.
+// Constants from asm-generic/ioctl.h.
+const (
+	_IOC_NONE  = 0
+	_IOC_WRITE = 1
+	_IOC_READ  = 2
+)
+
+// Constants from asm-generic/ioctl.h.
+const (
+	_IOC_NRBITS   = 8
+	_IOC_TYPEBITS = 8
+	_IOC_SIZEBITS = 14
+	_IOC_DIRBITS  = 2
+
+	_IOC_NRSHIFT   = 0
+	_IOC_TYPESHIFT = _IOC_NRSHIFT + _IOC_NRBITS
+	_IOC_SIZESHIFT = _IOC_TYPESHIFT + _IOC_TYPEBITS
+	_IOC_DIRSHIFT  = _IOC_SIZESHIFT + _IOC_SIZEBITS
+)
+
+// IOC outputs the result of _IOC macro in asm-generic/ioctl.h.
+func IOC(dir, typ, nr, size uint32) uint32 {
+	return uint32(dir)<<_IOC_DIRSHIFT | typ<<_IOC_TYPESHIFT | nr<<_IOC_NRSHIFT | size<<_IOC_SIZESHIFT
+}
diff --git a/pkg/abi/linux/ioctl_tun.go b/pkg/abi/linux/ioctl_tun.go
new file mode 100644
index 000000000..c59c9c136
--- /dev/null
+++ b/pkg/abi/linux/ioctl_tun.go
@@ -0,0 +1,29 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// ioctl(2) request numbers from linux/if_tun.h
+var (
+	TUNSETIFF = IOC(_IOC_WRITE, 'T', 202, 4)
+	TUNGETIFF = IOC(_IOC_READ, 'T', 210, 4)
+)
+
+// Flags from net/if_tun.h
+const (
+	IFF_TUN      = 0x0001
+	IFF_TAP      = 0x0002
+	IFF_NO_PI    = 0x1000
+	IFF_NOFILTER = 0x1000
+)
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 4c4b7d5cc..9b6bb26d0 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -9,6 +9,7 @@ go_library(
         "device.go",
         "fs.go",
         "full.go",
+        "net_tun.go",
         "null.go",
         "random.go",
         "tty.go",
@@ -19,15 +20,19 @@ go_library(
         "//pkg/context",
         "//pkg/rand",
         "//pkg/safemem",
+        "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/ramfs",
         "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
         "//pkg/sentry/pgalloc",
+        "//pkg/sentry/socket/netstack",
         "//pkg/syserror",
+        "//pkg/tcpip/link/tun",
         "//pkg/usermem",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go
index 35bd23991..7e66c29b0 100644
--- a/pkg/sentry/fs/dev/dev.go
+++ b/pkg/sentry/fs/dev/dev.go
@@ -66,8 +66,8 @@ func newMemDevice(ctx context.Context, iops fs.InodeOperations, msrc *fs.MountSo
 	})
 }
 
-func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
-	iops := ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555))
+func newDirectory(ctx context.Context, contents map[string]*fs.Inode, msrc *fs.MountSource) *fs.Inode {
+	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
 	return fs.NewInode(ctx, iops, msrc, fs.StableAttr{
 		DeviceID:  devDevice.DeviceID(),
 		InodeID:   devDevice.NextIno(),
@@ -111,7 +111,7 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		// A devpts is typically mounted at /dev/pts to provide
 		// pseudoterminal support. Place an empty directory there for
 		// the devpts to be mounted over.
-		"pts": newDirectory(ctx, msrc),
+		"pts": newDirectory(ctx, nil, msrc),
 		// Similarly, applications expect a ptmx device at /dev/ptmx
 		// connected to the terminals provided by /dev/pts/. Rather
 		// than creating a device directly (which requires a hairy
@@ -124,6 +124,10 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		"ptmx": newSymlink(ctx, "pts/ptmx", msrc),
 
 		"tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor),
+
+		"net": newDirectory(ctx, map[string]*fs.Inode{
+			"tun": newCharacterDevice(ctx, newNetTunDevice(ctx, fs.RootOwner, 0666), msrc, netTunDevMajor, netTunDevMinor),
+		}, msrc),
 	}
 
 	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
new file mode 100644
index 000000000..755644488
--- /dev/null
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -0,0 +1,170 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package dev
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	netTunDevMajor = 10
+	netTunDevMinor = 200
+)
+
+// +stateify savable
+type netTunInodeOperations struct {
+	fsutil.InodeGenericChecker       `state:"nosave"`
+	fsutil.InodeNoExtendedAttributes `state:"nosave"`
+	fsutil.InodeNoopAllocate         `state:"nosave"`
+	fsutil.InodeNoopRelease          `state:"nosave"`
+	fsutil.InodeNoopTruncate         `state:"nosave"`
+	fsutil.InodeNoopWriteOut         `state:"nosave"`
+	fsutil.InodeNotDirectory         `state:"nosave"`
+	fsutil.InodeNotMappable          `state:"nosave"`
+	fsutil.InodeNotSocket            `state:"nosave"`
+	fsutil.InodeNotSymlink           `state:"nosave"`
+	fsutil.InodeVirtual              `state:"nosave"`
+
+	fsutil.InodeSimpleAttributes
+}
+
+var _ fs.InodeOperations = (*netTunInodeOperations)(nil)
+
+func newNetTunDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *netTunInodeOperations {
+	return &netTunInodeOperations{
+		InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC),
+	}
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (iops *netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return fs.NewFile(ctx, d, flags, &netTunFileOperations{}), nil
+}
+
+// +stateify savable
+type netTunFileOperations struct {
+	fsutil.FileNoSeek               `state:"nosave"`
+	fsutil.FileNoMMap               `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoopFlush            `state:"nosave"`
+	fsutil.FileNoopFsync            `state:"nosave"`
+	fsutil.FileNotDirReaddir        `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+	device tun.Device
+}
+
+var _ fs.FileOperations = (*netTunFileOperations)(nil)
+
+// Release implements fs.FileOperations.Release.
+func (fops *netTunFileOperations) Release() {
+	fops.device.Release()
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	request := args[1].Uint()
+	data := args[2].Pointer()
+
+	switch request {
+	case linux.TUNSETIFF:
+		t := kernel.TaskFromContext(ctx)
+		if t == nil {
+			panic("Ioctl should be called from a task context")
+		}
+		if !t.HasCapability(linux.CAP_NET_ADMIN) {
+			return 0, syserror.EPERM
+		}
+		stack, ok := t.NetworkContext().(*netstack.Stack)
+		if !ok {
+			return 0, syserror.EINVAL
+		}
+
+		var req linux.IFReq
+		if _, err := usermem.CopyObjectIn(ctx, io, data, &req, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		flags := usermem.ByteOrder.Uint16(req.Data[:])
+		return 0, fops.device.SetIff(stack.Stack, req.Name(), flags)
+
+	case linux.TUNGETIFF:
+		var req linux.IFReq
+
+		copy(req.IFName[:], fops.device.Name())
+
+		// Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when
+		// there is no sk_filter. See __tun_chr_ioctl() in net/drivers/tun.c.
+		flags := fops.device.Flags() | linux.IFF_NOFILTER
+		usermem.ByteOrder.PutUint16(req.Data[:], flags)
+
+		_, err := usermem.CopyObjectOut(ctx, io, data, &req, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// Write implements fs.FileOperations.Write.
+func (fops *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	data := make([]byte, src.NumBytes())
+	if _, err := src.CopyIn(ctx, data); err != nil {
+		return 0, err
+	}
+	return fops.device.Write(data)
+}
+
+// Read implements fs.FileOperations.Read.
+func (fops *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	data, err := fops.device.Read()
+	if err != nil {
+		return 0, err
+	}
+	n, err := dst.CopyOut(ctx, data)
+	if n > 0 && n < len(data) {
+		// Not an error for partial copying. Packet truncated.
+		err = nil
+	}
+	return int64(n), err
+}
+
+// Readiness implements watier.Waitable.Readiness.
+func (fops *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fops.device.Readiness(mask)
+}
+
+// EventRegister implements watier.Waitable.EventRegister.
+func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fops.device.EventRegister(e, mask)
+}
+
+// EventUnregister implements watier.Waitable.EventUnregister.
+func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) {
+	fops.device.EventUnregister(e)
+}
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go
index 2269f6237..4b5a0fca6 100644
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@@ -29,6 +29,7 @@ var (
 	EACCES       = error(syscall.EACCES)
 	EAGAIN       = error(syscall.EAGAIN)
 	EBADF        = error(syscall.EBADF)
+	EBADFD       = error(syscall.EBADFD)
 	EBUSY        = error(syscall.EBUSY)
 	ECHILD       = error(syscall.ECHILD)
 	ECONNREFUSED = error(syscall.ECONNREFUSED)
diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go
index 150310c11..17e94c562 100644
--- a/pkg/tcpip/buffer/view.go
+++ b/pkg/tcpip/buffer/view.go
@@ -156,3 +156,9 @@ func (vv *VectorisedView) Append(vv2 VectorisedView) {
 	vv.views = append(vv.views, vv2.views...)
 	vv.size += vv2.size
 }
+
+// AppendView appends the given view into this vectorised view.
+func (vv *VectorisedView) AppendView(v View) {
+	vv.views = append(vv.views, v)
+	vv.size += len(v)
+}
diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD
index 3974c464e..b8b93e78e 100644
--- a/pkg/tcpip/link/channel/BUILD
+++ b/pkg/tcpip/link/channel/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = ["channel.go"],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/stack",
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 78d447acd..5944ba190 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -20,6 +20,7 @@ package channel
 import (
 	"context"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -33,6 +34,118 @@ type PacketInfo struct {
 	Route stack.Route
 }
 
+// Notification is the interface for receiving notification from the packet
+// queue.
+type Notification interface {
+	// WriteNotify will be called when a write happens to the queue.
+	WriteNotify()
+}
+
+// NotificationHandle is an opaque handle to the registered notification target.
+// It can be used to unregister the notification when no longer interested.
+//
+// +stateify savable
+type NotificationHandle struct {
+	n Notification
+}
+
+type queue struct {
+	// mu protects fields below.
+	mu sync.RWMutex
+	// c is the outbound packet channel. Sending to c should hold mu.
+	c        chan PacketInfo
+	numWrite int
+	numRead  int
+	notify   []*NotificationHandle
+}
+
+func (q *queue) Close() {
+	close(q.c)
+}
+
+func (q *queue) Read() (PacketInfo, bool) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	select {
+	case p := <-q.c:
+		q.numRead++
+		return p, true
+	default:
+		return PacketInfo{}, false
+	}
+}
+
+func (q *queue) ReadContext(ctx context.Context) (PacketInfo, bool) {
+	// We have to receive from channel without holding the lock, since it can
+	// block indefinitely. This will cause a window that numWrite - numRead
+	// produces a larger number, but won't go to negative. numWrite >= numRead
+	// still holds.
+	select {
+	case pkt := <-q.c:
+		q.mu.Lock()
+		defer q.mu.Unlock()
+		q.numRead++
+		return pkt, true
+	case <-ctx.Done():
+		return PacketInfo{}, false
+	}
+}
+
+func (q *queue) Write(p PacketInfo) bool {
+	wrote := false
+
+	// It's important to make sure nobody can see numWrite until we increment it,
+	// so numWrite >= numRead holds.
+	q.mu.Lock()
+	select {
+	case q.c <- p:
+		wrote = true
+		q.numWrite++
+	default:
+	}
+	notify := q.notify
+	q.mu.Unlock()
+
+	if wrote {
+		// Send notification outside of lock.
+		for _, h := range notify {
+			h.n.WriteNotify()
+		}
+	}
+	return wrote
+}
+
+func (q *queue) Num() int {
+	q.mu.RLock()
+	defer q.mu.RUnlock()
+	n := q.numWrite - q.numRead
+	if n < 0 {
+		panic("numWrite < numRead")
+	}
+	return n
+}
+
+func (q *queue) AddNotify(notify Notification) *NotificationHandle {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	h := &NotificationHandle{n: notify}
+	q.notify = append(q.notify, h)
+	return h
+}
+
+func (q *queue) RemoveNotify(handle *NotificationHandle) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	// Make a copy, since we reads the array outside of lock when notifying.
+	notify := make([]*NotificationHandle, 0, len(q.notify))
+	for _, h := range q.notify {
+		if h != handle {
+			notify = append(notify, h)
+		}
+	}
+	q.notify = notify
+}
+
 // Endpoint is link layer endpoint that stores outbound packets in a channel
 // and allows injection of inbound packets.
 type Endpoint struct {
@@ -41,14 +154,16 @@ type Endpoint struct {
 	linkAddr           tcpip.LinkAddress
 	LinkEPCapabilities stack.LinkEndpointCapabilities
 
-	// c is where outbound packets are queued.
-	c chan PacketInfo
+	// Outbound packet queue.
+	q *queue
 }
 
 // New creates a new channel endpoint.
 func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
 	return &Endpoint{
-		c:        make(chan PacketInfo, size),
+		q: &queue{
+			c: make(chan PacketInfo, size),
+		},
 		mtu:      mtu,
 		linkAddr: linkAddr,
 	}
@@ -57,43 +172,36 @@ func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
 // Close closes e. Further packet injections will panic. Reads continue to
 // succeed until all packets are read.
 func (e *Endpoint) Close() {
-	close(e.c)
+	e.q.Close()
 }
 
-// Read does non-blocking read for one packet from the outbound packet queue.
+// Read does non-blocking read one packet from the outbound packet queue.
 func (e *Endpoint) Read() (PacketInfo, bool) {
-	select {
-	case pkt := <-e.c:
-		return pkt, true
-	default:
-		return PacketInfo{}, false
-	}
+	return e.q.Read()
 }
 
 // ReadContext does blocking read for one packet from the outbound packet queue.
 // It can be cancelled by ctx, and in this case, it returns false.
 func (e *Endpoint) ReadContext(ctx context.Context) (PacketInfo, bool) {
-	select {
-	case pkt := <-e.c:
-		return pkt, true
-	case <-ctx.Done():
-		return PacketInfo{}, false
-	}
+	return e.q.ReadContext(ctx)
 }
 
 // Drain removes all outbound packets from the channel and counts them.
 func (e *Endpoint) Drain() int {
 	c := 0
 	for {
-		select {
-		case <-e.c:
-			c++
-		default:
+		if _, ok := e.Read(); !ok {
 			return c
 		}
+		c++
 	}
 }
 
+// NumQueued returns the number of packet queued for outbound.
+func (e *Endpoint) NumQueued() int {
+	return e.q.Num()
+}
+
 // InjectInbound injects an inbound packet.
 func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	e.InjectLinkAddr(protocol, "", pkt)
@@ -155,10 +263,7 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 		Route: route,
 	}
 
-	select {
-	case e.c <- p:
-	default:
-	}
+	e.q.Write(p)
 
 	return nil
 }
@@ -171,7 +276,6 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 	route.Release()
 	payloadView := pkts[0].Data.ToView()
 	n := 0
-packetLoop:
 	for _, pkt := range pkts {
 		off := pkt.DataOffset
 		size := pkt.DataSize
@@ -185,12 +289,10 @@ packetLoop:
 			Route: route,
 		}
 
-		select {
-		case e.c <- p:
-			n++
-		default:
-			break packetLoop
+		if !e.q.Write(p) {
+			break
 		}
+		n++
 	}
 
 	return n, nil
@@ -204,13 +306,21 @@ func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 		GSO:   nil,
 	}
 
-	select {
-	case e.c <- p:
-	default:
-	}
+	e.q.Write(p)
 
 	return nil
 }
 
 // Wait implements stack.LinkEndpoint.Wait.
 func (*Endpoint) Wait() {}
+
+// AddNotify adds a notification target for receiving event about outgoing
+// packets.
+func (e *Endpoint) AddNotify(notify Notification) *NotificationHandle {
+	return e.q.AddNotify(notify)
+}
+
+// RemoveNotify removes handle from the list of notification targets.
+func (e *Endpoint) RemoveNotify(handle *NotificationHandle) {
+	e.q.RemoveNotify(handle)
+}
diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD
index e5096ea38..e0db6cf54 100644
--- a/pkg/tcpip/link/tun/BUILD
+++ b/pkg/tcpip/link/tun/BUILD
@@ -4,6 +4,22 @@ package(licenses = ["notice"])
 
 go_library(
     name = "tun",
-    srcs = ["tun_unsafe.go"],
+    srcs = [
+        "device.go",
+        "protocol.go",
+        "tun_unsafe.go",
+    ],
     visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/refs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/stack",
+        "//pkg/waiter",
+    ],
 )
diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
new file mode 100644
index 000000000..6ff47a742
--- /dev/null
+++ b/pkg/tcpip/link/tun/device.go
@@ -0,0 +1,352 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// drivers/net/tun.c:tun_net_init()
+	defaultDevMtu = 1500
+
+	// Queue length for outbound packet, arriving at fd side for read. Overflow
+	// causes packet drops. gVisor implementation-specific.
+	defaultDevOutQueueLen = 1024
+)
+
+var zeroMAC [6]byte
+
+// Device is an opened /dev/net/tun device.
+//
+// +stateify savable
+type Device struct {
+	waiter.Queue
+
+	mu           sync.RWMutex `state:"nosave"`
+	endpoint     *tunEndpoint
+	notifyHandle *channel.NotificationHandle
+	flags        uint16
+}
+
+// beforeSave is invoked by stateify.
+func (d *Device) beforeSave() {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	// TODO(b/110961832): Restore the device to stack. At this moment, the stack
+	// is not savable.
+	if d.endpoint != nil {
+		panic("/dev/net/tun does not support save/restore when a device is associated with it.")
+	}
+}
+
+// Release implements fs.FileOperations.Release.
+func (d *Device) Release() {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	// Decrease refcount if there is an endpoint associated with this file.
+	if d.endpoint != nil {
+		d.endpoint.RemoveNotify(d.notifyHandle)
+		d.endpoint.DecRef()
+		d.endpoint = nil
+	}
+}
+
+// SetIff services TUNSETIFF ioctl(2) request.
+func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	if d.endpoint != nil {
+		return syserror.EINVAL
+	}
+
+	// Input validations.
+	isTun := flags&linux.IFF_TUN != 0
+	isTap := flags&linux.IFF_TAP != 0
+	supportedFlags := uint16(linux.IFF_TUN | linux.IFF_TAP | linux.IFF_NO_PI)
+	if isTap && isTun || !isTap && !isTun || flags&^supportedFlags != 0 {
+		return syserror.EINVAL
+	}
+
+	prefix := "tun"
+	if isTap {
+		prefix = "tap"
+	}
+
+	endpoint, err := attachOrCreateNIC(s, name, prefix)
+	if err != nil {
+		return syserror.EINVAL
+	}
+
+	d.endpoint = endpoint
+	d.notifyHandle = d.endpoint.AddNotify(d)
+	d.flags = flags
+	return nil
+}
+
+func attachOrCreateNIC(s *stack.Stack, name, prefix string) (*tunEndpoint, error) {
+	for {
+		// 1. Try to attach to an existing NIC.
+		if name != "" {
+			if nic, found := s.GetNICByName(name); found {
+				endpoint, ok := nic.LinkEndpoint().(*tunEndpoint)
+				if !ok {
+					// Not a NIC created by tun device.
+					return nil, syserror.EOPNOTSUPP
+				}
+				if !endpoint.TryIncRef() {
+					// Race detected: NIC got deleted in between.
+					continue
+				}
+				return endpoint, nil
+			}
+		}
+
+		// 2. Creating a new NIC.
+		id := tcpip.NICID(s.UniqueID())
+		endpoint := &tunEndpoint{
+			Endpoint: channel.New(defaultDevOutQueueLen, defaultDevMtu, ""),
+			stack:    s,
+			nicID:    id,
+			name:     name,
+		}
+		if endpoint.name == "" {
+			endpoint.name = fmt.Sprintf("%s%d", prefix, id)
+		}
+		err := s.CreateNICWithOptions(endpoint.nicID, endpoint, stack.NICOptions{
+			Name: endpoint.name,
+		})
+		switch err {
+		case nil:
+			return endpoint, nil
+		case tcpip.ErrDuplicateNICID:
+			// Race detected: A NIC has been created in between.
+			continue
+		default:
+			return nil, syserror.EINVAL
+		}
+	}
+}
+
+// Write inject one inbound packet to the network interface.
+func (d *Device) Write(data []byte) (int64, error) {
+	d.mu.RLock()
+	endpoint := d.endpoint
+	d.mu.RUnlock()
+	if endpoint == nil {
+		return 0, syserror.EBADFD
+	}
+	if !endpoint.IsAttached() {
+		return 0, syserror.EIO
+	}
+
+	dataLen := int64(len(data))
+
+	// Packet information.
+	var pktInfoHdr PacketInfoHeader
+	if !d.hasFlags(linux.IFF_NO_PI) {
+		if len(data) < PacketInfoHeaderSize {
+			// Ignore bad packet.
+			return dataLen, nil
+		}
+		pktInfoHdr = PacketInfoHeader(data[:PacketInfoHeaderSize])
+		data = data[PacketInfoHeaderSize:]
+	}
+
+	// Ethernet header (TAP only).
+	var ethHdr header.Ethernet
+	if d.hasFlags(linux.IFF_TAP) {
+		if len(data) < header.EthernetMinimumSize {
+			// Ignore bad packet.
+			return dataLen, nil
+		}
+		ethHdr = header.Ethernet(data[:header.EthernetMinimumSize])
+		data = data[header.EthernetMinimumSize:]
+	}
+
+	// Try to determine network protocol number, default zero.
+	var protocol tcpip.NetworkProtocolNumber
+	switch {
+	case pktInfoHdr != nil:
+		protocol = pktInfoHdr.Protocol()
+	case ethHdr != nil:
+		protocol = ethHdr.Type()
+	}
+
+	// Try to determine remote link address, default zero.
+	var remote tcpip.LinkAddress
+	switch {
+	case ethHdr != nil:
+		remote = ethHdr.SourceAddress()
+	default:
+		remote = tcpip.LinkAddress(zeroMAC[:])
+	}
+
+	pkt := tcpip.PacketBuffer{
+		Data: buffer.View(data).ToVectorisedView(),
+	}
+	if ethHdr != nil {
+		pkt.LinkHeader = buffer.View(ethHdr)
+	}
+	endpoint.InjectLinkAddr(protocol, remote, pkt)
+	return dataLen, nil
+}
+
+// Read reads one outgoing packet from the network interface.
+func (d *Device) Read() ([]byte, error) {
+	d.mu.RLock()
+	endpoint := d.endpoint
+	d.mu.RUnlock()
+	if endpoint == nil {
+		return nil, syserror.EBADFD
+	}
+
+	for {
+		info, ok := endpoint.Read()
+		if !ok {
+			return nil, syserror.ErrWouldBlock
+		}
+
+		v, ok := d.encodePkt(&info)
+		if !ok {
+			// Ignore unsupported packet.
+			continue
+		}
+		return v, nil
+	}
+}
+
+// encodePkt encodes packet for fd side.
+func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) {
+	var vv buffer.VectorisedView
+
+	// Packet information.
+	if !d.hasFlags(linux.IFF_NO_PI) {
+		hdr := make(PacketInfoHeader, PacketInfoHeaderSize)
+		hdr.Encode(&PacketInfoFields{
+			Protocol: info.Proto,
+		})
+		vv.AppendView(buffer.View(hdr))
+	}
+
+	// If the packet does not already have link layer header, and the route
+	// does not exist, we can't compute it. This is possibly a raw packet, tun
+	// device doesn't support this at the moment.
+	if info.Pkt.LinkHeader == nil && info.Route.RemoteLinkAddress == "" {
+		return nil, false
+	}
+
+	// Ethernet header (TAP only).
+	if d.hasFlags(linux.IFF_TAP) {
+		// Add ethernet header if not provided.
+		if info.Pkt.LinkHeader == nil {
+			hdr := &header.EthernetFields{
+				SrcAddr: info.Route.LocalLinkAddress,
+				DstAddr: info.Route.RemoteLinkAddress,
+				Type:    info.Proto,
+			}
+			if hdr.SrcAddr == "" {
+				hdr.SrcAddr = d.endpoint.LinkAddress()
+			}
+
+			eth := make(header.Ethernet, header.EthernetMinimumSize)
+			eth.Encode(hdr)
+			vv.AppendView(buffer.View(eth))
+		} else {
+			vv.AppendView(info.Pkt.LinkHeader)
+		}
+	}
+
+	// Append upper headers.
+	vv.AppendView(buffer.View(info.Pkt.Header.View()[len(info.Pkt.LinkHeader):]))
+	// Append data payload.
+	vv.Append(info.Pkt.Data)
+
+	return vv.ToView(), true
+}
+
+// Name returns the name of the attached network interface. Empty string if
+// unattached.
+func (d *Device) Name() string {
+	d.mu.RLock()
+	defer d.mu.RUnlock()
+	if d.endpoint != nil {
+		return d.endpoint.name
+	}
+	return ""
+}
+
+// Flags returns the flags set for d. Zero value if unset.
+func (d *Device) Flags() uint16 {
+	d.mu.RLock()
+	defer d.mu.RUnlock()
+	return d.flags
+}
+
+func (d *Device) hasFlags(flags uint16) bool {
+	return d.flags&flags == flags
+}
+
+// Readiness implements watier.Waitable.Readiness.
+func (d *Device) Readiness(mask waiter.EventMask) waiter.EventMask {
+	if mask&waiter.EventIn != 0 {
+		d.mu.RLock()
+		endpoint := d.endpoint
+		d.mu.RUnlock()
+		if endpoint != nil && endpoint.NumQueued() == 0 {
+			mask &= ^waiter.EventIn
+		}
+	}
+	return mask & (waiter.EventIn | waiter.EventOut)
+}
+
+// WriteNotify implements channel.Notification.WriteNotify.
+func (d *Device) WriteNotify() {
+	d.Notify(waiter.EventIn)
+}
+
+// tunEndpoint is the link endpoint for the NIC created by the tun device.
+//
+// It is ref-counted as multiple opening files can attach to the same NIC.
+// The last owner is responsible for deleting the NIC.
+type tunEndpoint struct {
+	*channel.Endpoint
+
+	refs.AtomicRefCount
+
+	stack *stack.Stack
+	nicID tcpip.NICID
+	name  string
+}
+
+// DecRef decrements refcount of e, removes NIC if refcount goes to 0.
+func (e *tunEndpoint) DecRef() {
+	e.DecRefWithDestructor(func() {
+		e.stack.RemoveNIC(e.nicID)
+	})
+}
diff --git a/pkg/tcpip/link/tun/protocol.go b/pkg/tcpip/link/tun/protocol.go
new file mode 100644
index 000000000..89d9d91a9
--- /dev/null
+++ b/pkg/tcpip/link/tun/protocol.go
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	// PacketInfoHeaderSize is the size of the packet information header.
+	PacketInfoHeaderSize = 4
+
+	offsetFlags    = 0
+	offsetProtocol = 2
+)
+
+// PacketInfoFields contains fields sent through the wire if IFF_NO_PI flag is
+// not set.
+type PacketInfoFields struct {
+	Flags    uint16
+	Protocol tcpip.NetworkProtocolNumber
+}
+
+// PacketInfoHeader is the wire representation of the packet information sent if
+// IFF_NO_PI flag is not set.
+type PacketInfoHeader []byte
+
+// Encode encodes f into h.
+func (h PacketInfoHeader) Encode(f *PacketInfoFields) {
+	binary.BigEndian.PutUint16(h[offsetFlags:][:2], f.Flags)
+	binary.BigEndian.PutUint16(h[offsetProtocol:][:2], uint16(f.Protocol))
+}
+
+// Flags returns the flag field in h.
+func (h PacketInfoHeader) Flags() uint16 {
+	return binary.BigEndian.Uint16(h[offsetFlags:])
+}
+
+// Protocol returns the protocol field in h.
+func (h PacketInfoHeader) Protocol() tcpip.NetworkProtocolNumber {
+	return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(h[offsetProtocol:]))
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 862954ab2..46d3a6646 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -298,6 +298,33 @@ func (n *NIC) enable() *tcpip.Error {
 	return nil
 }
 
+// remove detaches NIC from the link endpoint, and marks existing referenced
+// network endpoints expired. This guarantees no packets between this NIC and
+// the network stack.
+func (n *NIC) remove() *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	// Detach from link endpoint, so no packet comes in.
+	n.linkEP.Attach(nil)
+
+	// Remove permanent and permanentTentative addresses, so no packet goes out.
+	var errs []*tcpip.Error
+	for nid, ref := range n.mu.endpoints {
+		switch ref.getKind() {
+		case permanentTentative, permanent:
+			if err := n.removePermanentAddressLocked(nid.LocalAddress); err != nil {
+				errs = append(errs, err)
+			}
+		}
+	}
+	if len(errs) > 0 {
+		return errs[0]
+	}
+
+	return nil
+}
+
 // becomeIPv6Router transitions n into an IPv6 router.
 //
 // When transitioning into an IPv6 router, host-only state (NDP discovered
@@ -1302,6 +1329,11 @@ func (n *NIC) Stack() *Stack {
 	return n.stack
 }
 
+// LinkEndpoint returns the link endpoint of n.
+func (n *NIC) LinkEndpoint() LinkEndpoint {
+	return n.linkEP
+}
+
 // isAddrTentative returns true if addr is tentative on n.
 //
 // Note that if addr is not associated with n, then this function will return
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index f0ed76fbe..900dd46c5 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -916,6 +916,18 @@ func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
 	return s.CreateNICWithOptions(id, ep, NICOptions{})
 }
 
+// GetNICByName gets the NIC specified by name.
+func (s *Stack) GetNICByName(name string) (*NIC, bool) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	for _, nic := range s.nics {
+		if nic.Name() == name {
+			return nic, true
+		}
+	}
+	return nil, false
+}
+
 // EnableNIC enables the given NIC so that the link-layer endpoint can start
 // delivering packets to it.
 func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error {
@@ -956,6 +968,33 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	return nic.enabled()
 }
 
+// RemoveNIC removes NIC and all related routes from the network stack.
+func (s *Stack) RemoveNIC(id tcpip.NICID) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	nic, ok := s.nics[id]
+	if !ok {
+		return tcpip.ErrUnknownNICID
+	}
+	delete(s.nics, id)
+
+	// Remove routes in-place. n tracks the number of routes written.
+	n := 0
+	for i, r := range s.routeTable {
+		if r.NIC != id {
+			// Keep this route.
+			if i > n {
+				s.routeTable[n] = r
+			}
+			n++
+		}
+	}
+	s.routeTable = s.routeTable[:n]
+
+	return nic.remove()
+}
+
 // NICAddressRanges returns a map of NICIDs to their associated subnets.
 func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
 	s.mu.RLock()
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index d1977d4de..3518e862d 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -678,6 +678,8 @@ syscall_test(
     test = "//test/syscalls/linux:truncate_test",
 )
 
+syscall_test(test = "//test/syscalls/linux:tuntap_test")
+
 syscall_test(test = "//test/syscalls/linux:udp_bind_test")
 
 syscall_test(
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index aa303af84..704bae17b 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -131,6 +131,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "socket_netlink_route_util",
+    testonly = 1,
+    srcs = ["socket_netlink_route_util.cc"],
+    hdrs = ["socket_netlink_route_util.h"],
+    deps = [
+        ":socket_netlink_util",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
 cc_library(
     name = "socket_test_util",
     testonly = 1,
@@ -3430,6 +3441,25 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "tuntap_test",
+    testonly = 1,
+    srcs = ["tuntap.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        gtest,
+        "//test/syscalls/linux:socket_netlink_route_util",
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "udp_socket_test_cases",
     testonly = 1,
diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc
index 4dd302eed..4e473268c 100644
--- a/test/syscalls/linux/dev.cc
+++ b/test/syscalls/linux/dev.cc
@@ -153,6 +153,13 @@ TEST(DevTest, TTYExists) {
   EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
 }
 
+TEST(DevTest, NetTunExists) {
+  struct stat statbuf = {};
+  ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallSucceeds());
+  // Check that it's a character device with rw-rw-rw- permissions.
+  EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
+}
+
 }  // namespace
 }  // namespace testing
 
diff --git a/test/syscalls/linux/socket_netlink_route_util.cc b/test/syscalls/linux/socket_netlink_route_util.cc
new file mode 100644
index 000000000..53eb3b6b2
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_route_util.cc
@@ -0,0 +1,163 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+
+#include <linux/if.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include "absl/types/optional.h"
+#include "test/syscalls/linux/socket_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr uint32_t kSeq = 12345;
+
+}  // namespace
+
+PosixError DumpLinks(
+    const FileDescriptor& fd, uint32_t seq,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_seq = seq;
+  req.ifm.ifi_family = AF_UNSPEC;
+
+  return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false);
+}
+
+PosixErrorOr<std::vector<Link>> DumpLinks() {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  std::vector<Link> links;
+  RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    if (hdr->nlmsg_type != RTM_NEWLINK ||
+        hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
+      return;
+    }
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+    if (rta == nullptr) {
+      // Ignore links that do not have a name.
+      return;
+    }
+
+    links.emplace_back();
+    links.back().index = msg->ifi_index;
+    links.back().type = msg->ifi_type;
+    links.back().name =
+        std::string(reinterpret_cast<const char*>(RTA_DATA(rta)));
+  }));
+  return links;
+}
+
+PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  for (const auto& link : links) {
+    if (link.type == ARPHRD_LOOPBACK) {
+      return absl::optional<Link>(link);
+    }
+  }
+  return absl::optional<Link>();
+}
+
+PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
+                            const void* addr, int addrlen) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifaddrmsg ifaddr;
+    char attrbuf[512];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifaddr));
+  req.hdr.nlmsg_type = RTM_NEWADDR;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifaddr.ifa_index = index;
+  req.ifaddr.ifa_family = family;
+  req.ifaddr.ifa_prefixlen = prefixlen;
+
+  struct rtattr* rta = reinterpret_cast<struct rtattr*>(
+      reinterpret_cast<int8_t*>(&req) + NLMSG_ALIGN(req.hdr.nlmsg_len));
+  rta->rta_type = IFA_LOCAL;
+  rta->rta_len = RTA_LENGTH(addrlen);
+  req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + RTA_LENGTH(addrlen);
+  memcpy(RTA_DATA(rta), addr, addrlen);
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifinfo;
+    char pad[NLMSG_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifinfo));
+  req.hdr.nlmsg_type = RTM_NEWLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifinfo.ifi_index = index;
+  req.ifinfo.ifi_flags = flags;
+  req.ifinfo.ifi_change = change;
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+PosixError LinkSetMacAddr(int index, const void* addr, int addrlen) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifinfo;
+    char attrbuf[512];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifinfo));
+  req.hdr.nlmsg_type = RTM_NEWLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifinfo.ifi_index = index;
+
+  struct rtattr* rta = reinterpret_cast<struct rtattr*>(
+      reinterpret_cast<int8_t*>(&req) + NLMSG_ALIGN(req.hdr.nlmsg_len));
+  rta->rta_type = IFLA_ADDRESS;
+  rta->rta_len = RTA_LENGTH(addrlen);
+  req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + RTA_LENGTH(addrlen);
+  memcpy(RTA_DATA(rta), addr, addrlen);
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_route_util.h b/test/syscalls/linux/socket_netlink_route_util.h
new file mode 100644
index 000000000..2c018e487
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_route_util.h
@@ -0,0 +1,55 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <vector>
+
+#include "absl/types/optional.h"
+#include "test/syscalls/linux/socket_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+
+struct Link {
+  int index;
+  int16_t type;
+  std::string name;
+};
+
+PosixError DumpLinks(const FileDescriptor& fd, uint32_t seq,
+                     const std::function<void(const struct nlmsghdr* hdr)>& fn);
+
+PosixErrorOr<std::vector<Link>> DumpLinks();
+
+PosixErrorOr<absl::optional<Link>> FindLoopbackLink();
+
+// LinkAddLocalAddr sets IFA_LOCAL attribute on the interface.
+PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
+                            const void* addr, int addrlen);
+
+// LinkChangeFlags changes interface flags. E.g. IFF_UP.
+PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change);
+
+// LinkSetMacAddr sets IFLA_ADDRESS attribute of the interface.
+PosixError LinkSetMacAddr(int index, const void* addr, int addrlen);
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
new file mode 100644
index 000000000..f6ac9d7b8
--- /dev/null
+++ b/test/syscalls/linux/tuntap.cc
@@ -0,0 +1,346 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <linux/capability.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_tun.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_split.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr int kIPLen = 4;
+
+constexpr const char kDevNetTun[] = "/dev/net/tun";
+constexpr const char kTapName[] = "tap0";
+
+constexpr const uint8_t kMacA[ETH_ALEN] = {0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA};
+constexpr const uint8_t kMacB[ETH_ALEN] = {0xBB, 0xBB, 0xBB, 0xBB, 0xBB, 0xBB};
+
+PosixErrorOr<std::set<std::string>> DumpLinkNames() {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  std::set<std::string> names;
+  for (const auto& link : links) {
+    names.emplace(link.name);
+  }
+  return names;
+}
+
+PosixErrorOr<absl::optional<Link>> GetLinkByName(const std::string& name) {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  for (const auto& link : links) {
+    if (link.name == name) {
+      return absl::optional<Link>(link);
+    }
+  }
+  return absl::optional<Link>();
+}
+
+struct pihdr {
+  uint16_t pi_flags;
+  uint16_t pi_protocol;
+} __attribute__((packed));
+
+struct ping_pkt {
+  pihdr pi;
+  struct ethhdr eth;
+  struct iphdr ip;
+  struct icmphdr icmp;
+  char payload[64];
+} __attribute__((packed));
+
+ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
+                          const uint8_t dstmac[ETH_ALEN], const char* dstip) {
+  ping_pkt pkt = {};
+
+  pkt.pi.pi_protocol = htons(ETH_P_IP);
+
+  memcpy(pkt.eth.h_dest, dstmac, sizeof(pkt.eth.h_dest));
+  memcpy(pkt.eth.h_source, srcmac, sizeof(pkt.eth.h_source));
+  pkt.eth.h_proto = htons(ETH_P_IP);
+
+  pkt.ip.ihl = 5;
+  pkt.ip.version = 4;
+  pkt.ip.tos = 0;
+  pkt.ip.tot_len = htons(sizeof(struct iphdr) + sizeof(struct icmphdr) +
+                         sizeof(pkt.payload));
+  pkt.ip.id = 1;
+  pkt.ip.frag_off = 1 << 6;  // Do not fragment
+  pkt.ip.ttl = 64;
+  pkt.ip.protocol = IPPROTO_ICMP;
+  inet_pton(AF_INET, dstip, &pkt.ip.daddr);
+  inet_pton(AF_INET, srcip, &pkt.ip.saddr);
+  pkt.ip.check = IPChecksum(pkt.ip);
+
+  pkt.icmp.type = ICMP_ECHO;
+  pkt.icmp.code = 0;
+  pkt.icmp.checksum = 0;
+  pkt.icmp.un.echo.sequence = 1;
+  pkt.icmp.un.echo.id = 1;
+
+  strncpy(pkt.payload, "abcd", sizeof(pkt.payload));
+  pkt.icmp.checksum = ICMPChecksum(pkt.icmp, pkt.payload, sizeof(pkt.payload));
+
+  return pkt;
+}
+
+struct arp_pkt {
+  pihdr pi;
+  struct ethhdr eth;
+  struct arphdr arp;
+  uint8_t arp_sha[ETH_ALEN];
+  uint8_t arp_spa[kIPLen];
+  uint8_t arp_tha[ETH_ALEN];
+  uint8_t arp_tpa[kIPLen];
+} __attribute__((packed));
+
+std::string CreateArpPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
+                            const uint8_t dstmac[ETH_ALEN], const char* dstip) {
+  std::string buffer;
+  buffer.resize(sizeof(arp_pkt));
+
+  arp_pkt* pkt = reinterpret_cast<arp_pkt*>(&buffer[0]);
+  {
+    pkt->pi.pi_protocol = htons(ETH_P_ARP);
+
+    memcpy(pkt->eth.h_dest, kMacA, sizeof(pkt->eth.h_dest));
+    memcpy(pkt->eth.h_source, kMacB, sizeof(pkt->eth.h_source));
+    pkt->eth.h_proto = htons(ETH_P_ARP);
+
+    pkt->arp.ar_hrd = htons(ARPHRD_ETHER);
+    pkt->arp.ar_pro = htons(ETH_P_IP);
+    pkt->arp.ar_hln = ETH_ALEN;
+    pkt->arp.ar_pln = kIPLen;
+    pkt->arp.ar_op = htons(ARPOP_REPLY);
+
+    memcpy(pkt->arp_sha, srcmac, sizeof(pkt->arp_sha));
+    inet_pton(AF_INET, srcip, pkt->arp_spa);
+    memcpy(pkt->arp_tha, dstmac, sizeof(pkt->arp_tha));
+    inet_pton(AF_INET, dstip, pkt->arp_tpa);
+  }
+  return buffer;
+}
+
+}  // namespace
+
+class TuntapTest : public ::testing::Test {
+ protected:
+  void TearDown() override {
+    if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))) {
+      // Bring back capability if we had dropped it in test case.
+      ASSERT_NO_ERRNO(SetCapability(CAP_NET_ADMIN, true));
+    }
+  }
+};
+
+TEST_F(TuntapTest, CreateInterfaceNoCap) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  ASSERT_NO_ERRNO(SetCapability(CAP_NET_ADMIN, false));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr = {};
+  ifr.ifr_flags = IFF_TAP;
+  strncpy(ifr.ifr_name, kTapName, IFNAMSIZ);
+
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallFailsWithErrno(EPERM));
+}
+
+TEST_F(TuntapTest, CreateFixedNameInterface) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr_set = {};
+  ifr_set.ifr_flags = IFF_TAP;
+  strncpy(ifr_set.ifr_name, kTapName, IFNAMSIZ);
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr_set),
+              SyscallSucceedsWithValue(0));
+
+  struct ifreq ifr_get = {};
+  EXPECT_THAT(ioctl(fd.get(), TUNGETIFF, &ifr_get),
+              SyscallSucceedsWithValue(0));
+
+  struct ifreq ifr_expect = ifr_set;
+  // See __tun_chr_ioctl() in net/drivers/tun.c.
+  ifr_expect.ifr_flags |= IFF_NOFILTER;
+
+  EXPECT_THAT(DumpLinkNames(),
+              IsPosixErrorOkAndHolds(::testing::Contains(kTapName)));
+  EXPECT_THAT(memcmp(&ifr_expect, &ifr_get, sizeof(ifr_get)), ::testing::Eq(0));
+}
+
+TEST_F(TuntapTest, CreateInterface) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr = {};
+  ifr.ifr_flags = IFF_TAP;
+  // Empty ifr.ifr_name. Let kernel assign.
+
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallSucceedsWithValue(0));
+
+  struct ifreq ifr_get = {};
+  EXPECT_THAT(ioctl(fd.get(), TUNGETIFF, &ifr_get),
+              SyscallSucceedsWithValue(0));
+
+  std::string ifname = ifr_get.ifr_name;
+  EXPECT_THAT(ifname, ::testing::StartsWith("tap"));
+  EXPECT_THAT(DumpLinkNames(),
+              IsPosixErrorOkAndHolds(::testing::Contains(ifname)));
+}
+
+TEST_F(TuntapTest, InvalidReadWrite) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  char buf[128] = {};
+  EXPECT_THAT(read(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EBADFD));
+  EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EBADFD));
+}
+
+TEST_F(TuntapTest, WriteToDownDevice) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // FIXME: gVisor always creates enabled/up'd interfaces.
+  SKIP_IF(IsRunningOnGvisor());
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  // Device created should be down by default.
+  struct ifreq ifr = {};
+  ifr.ifr_flags = IFF_TAP;
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallSucceedsWithValue(0));
+
+  char buf[128] = {};
+  EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EIO));
+}
+
+// This test sets up a TAP device and pings kernel by sending ICMP echo request.
+//
+// It works as the following:
+// * Open /dev/net/tun, and create kTapName interface.
+// * Use rtnetlink to do initial setup of the interface:
+//   * Assign IP address 10.0.0.1/24 to kernel.
+//   * MAC address: kMacA
+//   * Bring up the interface.
+// * Send an ICMP echo reqest (ping) packet from 10.0.0.2 (kMacB) to kernel.
+// * Loop to receive packets from TAP device/fd:
+//   * If packet is an ICMP echo reply, it stops and passes the test.
+//   * If packet is an ARP request, it responds with canned reply and resends
+//   the
+//     ICMP request packet.
+TEST_F(TuntapTest, PingKernel) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // Interface creation.
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr_set = {};
+  ifr_set.ifr_flags = IFF_TAP;
+  strncpy(ifr_set.ifr_name, kTapName, IFNAMSIZ);
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr_set),
+              SyscallSucceedsWithValue(0));
+
+  absl::optional<Link> link =
+      ASSERT_NO_ERRNO_AND_VALUE(GetLinkByName(kTapName));
+  ASSERT_TRUE(link.has_value());
+
+  // Interface setup.
+  struct in_addr addr;
+  inet_pton(AF_INET, "10.0.0.1", &addr);
+  EXPECT_NO_ERRNO(LinkAddLocalAddr(link->index, AF_INET, /*prefixlen=*/24,
+                                   &addr, sizeof(addr)));
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME: gVisor doesn't support setting MAC address on interfaces yet.
+    EXPECT_NO_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA)));
+
+    // FIXME: gVisor always creates enabled/up'd interfaces.
+    EXPECT_NO_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP));
+  }
+
+  ping_pkt ping_req = CreatePingPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
+  std::string arp_rep = CreateArpPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
+
+  // Send ping, this would trigger an ARP request on Linux.
+  EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)),
+              SyscallSucceedsWithValue(sizeof(ping_req)));
+
+  // Receive loop to process inbound packets.
+  struct inpkt {
+    union {
+      pihdr pi;
+      ping_pkt ping;
+      arp_pkt arp;
+    };
+  };
+  while (1) {
+    inpkt r = {};
+    int n = read(fd.get(), &r, sizeof(r));
+    EXPECT_THAT(n, SyscallSucceeds());
+
+    if (n < sizeof(pihdr)) {
+      std::cerr << "Ignored packet, protocol: " << r.pi.pi_protocol
+                << " len: " << n << std::endl;
+      continue;
+    }
+
+    // Process ARP packet.
+    if (n >= sizeof(arp_pkt) && r.pi.pi_protocol == htons(ETH_P_ARP)) {
+      // Respond with canned ARP reply.
+      EXPECT_THAT(write(fd.get(), arp_rep.data(), arp_rep.size()),
+                  SyscallSucceedsWithValue(arp_rep.size()));
+      // First ping request might have been dropped due to mac address not in
+      // ARP cache. Send it again.
+      EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)),
+                  SyscallSucceedsWithValue(sizeof(ping_req)));
+    }
+
+    // Process ping response packet.
+    if (n >= sizeof(ping_pkt) && r.pi.pi_protocol == ping_req.pi.pi_protocol &&
+        r.ping.ip.protocol == ping_req.ip.protocol &&
+        !memcmp(&r.ping.ip.saddr, &ping_req.ip.daddr, kIPLen) &&
+        !memcmp(&r.ping.ip.daddr, &ping_req.ip.saddr, kIPLen) &&
+        r.ping.icmp.type == 0 && r.ping.icmp.code == 0) {
+      // Ends and passes the test.
+      break;
+    }
+  }
+}
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From 75d7f76a6cd81d77f5ce70440c1d95c0296b15ba Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Mon, 11 Nov 2019 20:26:38 -0800
Subject: arm64: add a travis build ci

Build runsc and run "runsc do ls".

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 .travis.yml                                  | 19 ++++++++++++++++++
 Dockerfile                                   | 11 ++++++-----
 Makefile                                     |  5 ++++-
 test/syscalls/linux/32bit.cc                 |  2 +-
 test/syscalls/linux/rseq/uapi.h              | 29 ++++++++++++----------------
 test/syscalls/linux/udp_socket_test_cases.cc |  4 ++++
 6 files changed, 46 insertions(+), 24 deletions(-)

(limited to 'test/syscalls')

diff --git a/.travis.yml b/.travis.yml
index e69de29bb..a2a260538 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -0,0 +1,19 @@
+language: minimal
+sudo: required
+dist: xenial
+cache:
+  directories:
+    - /home/travis/.cache/bazel/
+services:
+  - docker
+matrix:
+  include:
+   - os: linux
+     arch: amd64
+     env: RUNSC_PATH=./bazel-bin/runsc/linux_amd64_pure_stripped/runsc
+   - os: linux
+     arch: arm64
+     env: RUNSC_PATH=./bazel-bin/runsc/linux_arm64_pure_stripped/runsc
+script:
+   - uname -a
+   - make DOCKER_RUN_OPTIONS="" BAZEL_OPTIONS="build runsc:runsc" bazel && $RUNSC_PATH --alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do ls
diff --git a/Dockerfile b/Dockerfile
index 738623023..2bfdfec6c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,9 @@
-FROM ubuntu:bionic
+FROM fedora:31
 
-RUN apt-get update && apt-get install -y curl gnupg2 git python python3 python3-distutils python3-pip
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
-RUN apt-get update && apt-get install -y bazel && apt-get clean
+RUN  dnf install -y dnf-plugins-core && dnf copr enable -y vbatts/bazel
+
+RUN dnf install -y bazel2 git gcc make golang gcc-c++ glibc-devel python3 which python3-pip python3-devel libffi-devel openssl-devel pkg-config glibc-static
+
+RUN pip install pycparser
 
 WORKDIR /gvisor
diff --git a/Makefile b/Makefile
index a73bc0c36..d9531fbd5 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,9 @@ UID := $(shell id -u ${USER})
 GID := $(shell id -g ${USER})
 GVISOR_BAZEL_CACHE := $(shell readlink -f ~/.cache/bazel/)
 
+# The  --privileged is required to run tests.
+DOCKER_RUN_OPTIONS ?= --privileged
+
 all: runsc
 
 docker-build:
@@ -19,7 +22,7 @@ bazel-server-start: docker-build
 		-v "$(CURDIR):$(CURDIR)" \
 		--workdir "$(CURDIR)" \
 		--tmpfs /tmp:rw,exec \
-		--privileged \
+		$(DOCKER_RUN_OPTIONS) \
 		gvisor-bazel \
 		sh -c "while :; do sleep 100; done" && \
 	docker exec --user 0:0 -i gvisor-bazel sh -c "groupadd --gid $(GID) --non-unique gvisor && useradd --uid $(UID) --non-unique --gid $(GID) -d $(HOME) gvisor"
diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index c47a05181..3c825477c 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -74,7 +74,7 @@ void ExitGroup32(const char instruction[2], int code) {
       "int $3\n"
       :
       : [ code ] "m"(code), [ ip ] "d"(m.ptr())
-      : "rax", "rbx", "rsp");
+      : "rax", "rbx");
 }
 
 constexpr int kExitCode = 42;
diff --git a/test/syscalls/linux/rseq/uapi.h b/test/syscalls/linux/rseq/uapi.h
index e3ff0579a..ca1d67691 100644
--- a/test/syscalls/linux/rseq/uapi.h
+++ b/test/syscalls/linux/rseq/uapi.h
@@ -15,14 +15,9 @@
 #ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
 #define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
 
-// User-kernel ABI for restartable sequences.
+#include <stdint.h>
 
-// Standard types.
-//
-// N.B. This header will be included in targets that do have the standard
-// library, so we can't shadow the standard type names.
-using __u32 = __UINT32_TYPE__;
-using __u64 = __UINT64_TYPE__;
+// User-kernel ABI for restartable sequences.
 
 #ifdef __x86_64__
 // Syscall numbers.
@@ -32,20 +27,20 @@ constexpr int kRseqSyscall = 334;
 #endif  // __x86_64__
 
 struct rseq_cs {
-  __u32 version;
-  __u32 flags;
-  __u64 start_ip;
-  __u64 post_commit_offset;
-  __u64 abort_ip;
-} __attribute__((aligned(4 * sizeof(__u64))));
+  uint32_t version;
+  uint32_t flags;
+  uint64_t start_ip;
+  uint64_t post_commit_offset;
+  uint64_t abort_ip;
+} __attribute__((aligned(4 * sizeof(uint64_t))));
 
 // N.B. alignment is enforced by the kernel.
 struct rseq {
-  __u32 cpu_id_start;
-  __u32 cpu_id;
+  uint32_t cpu_id_start;
+  uint32_t cpu_id;
   struct rseq_cs* rseq_cs;
-  __u32 flags;
-} __attribute__((aligned(4 * sizeof(__u64))));
+  uint32_t flags;
+} __attribute__((aligned(4 * sizeof(uint64_t))));
 
 constexpr int kRseqFlagUnregister = 1 << 0;
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 57b1a357c..740c7986d 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -21,6 +21,10 @@
 #include <sys/socket.h>
 #include <sys/types.h>
 
+#ifndef SIOCGSTAMP
+#include <linux/sockios.h>
+#endif
+
 #include "gtest/gtest.h"
 #include "absl/base/macros.h"
 #include "absl/time/clock.h"
-- 
cgit v1.2.3


From de0b2ebf8635a75bfabfd0a8b48de7923017574e Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Wed, 26 Feb 2020 18:16:19 -0800
Subject: Add getsockopt tests for SO_SNDTIMEO and SO_RCVTIMEO

PiperOrigin-RevId: 297485310
---
 test/syscalls/linux/socket_generic.cc | 96 ++++++++++++++++++++++++++++++++---
 1 file changed, 88 insertions(+), 8 deletions(-)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc
index e8f24a59e..f7d6139f1 100644
--- a/test/syscalls/linux/socket_generic.cc
+++ b/test/syscalls/linux/socket_generic.cc
@@ -447,6 +447,60 @@ TEST_P(AllSocketPairTest, RecvTimeoutRecvmsgSucceeds) {
               SyscallFailsWithErrno(EAGAIN));
 }
 
+TEST_P(AllSocketPairTest, SendTimeoutDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  timeval actual_tv = {.tv_sec = -1, .tv_usec = -1};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv_usec, 0);
+}
+
+TEST_P(AllSocketPairTest, SetGetSendTimeout) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  timeval tv = {.tv_sec = 89, .tv_usec = 42000};
+  EXPECT_THAT(
+      setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)),
+      SyscallSucceeds());
+
+  timeval actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 89);
+  EXPECT_EQ(actual_tv.tv_usec, 42000);
+}
+
+TEST_P(AllSocketPairTest, SetGetSendTimeoutLargerArg) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  struct timeval_with_extra {
+    struct timeval tv;
+    int64_t extra_data;
+  } ABSL_ATTRIBUTE_PACKED;
+
+  timeval_with_extra tv_extra = {
+      .tv = {.tv_sec = 0, .tv_usec = 123000},
+  };
+
+  EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+                         &tv_extra, sizeof(tv_extra)),
+              SyscallSucceeds());
+
+  timeval_with_extra actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv.tv_usec, 123000);
+}
+
 TEST_P(AllSocketPairTest, SendTimeoutAllowsWrite) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -491,18 +545,36 @@ TEST_P(AllSocketPairTest, SendTimeoutAllowsSendmsg) {
   ASSERT_NO_FATAL_FAILURE(SendNullCmsg(sockets->first_fd(), buf, sizeof(buf)));
 }
 
-TEST_P(AllSocketPairTest, SoRcvTimeoIsSet) {
+TEST_P(AllSocketPairTest, RecvTimeoutDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
-  struct timeval tv {
-    .tv_sec = 0, .tv_usec = 35
-  };
+  timeval actual_tv = {.tv_sec = -1, .tv_usec = -1};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv_usec, 0);
+}
+
+TEST_P(AllSocketPairTest, SetGetRecvTimeout) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  timeval tv = {.tv_sec = 123, .tv_usec = 456000};
   EXPECT_THAT(
       setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
       SyscallSucceeds());
+
+  timeval actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 123);
+  EXPECT_EQ(actual_tv.tv_usec, 456000);
 }
 
-TEST_P(AllSocketPairTest, SoRcvTimeoIsSetLargerArg) {
+TEST_P(AllSocketPairTest, SetGetRecvTimeoutLargerArg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
   struct timeval_with_extra {
@@ -510,13 +582,21 @@ TEST_P(AllSocketPairTest, SoRcvTimeoIsSetLargerArg) {
     int64_t extra_data;
   } ABSL_ATTRIBUTE_PACKED;
 
-  timeval_with_extra tv_extra;
-  tv_extra.tv.tv_sec = 0;
-  tv_extra.tv.tv_usec = 25;
+  timeval_with_extra tv_extra = {
+      .tv = {.tv_sec = 0, .tv_usec = 432000},
+  };
 
   EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
                          &tv_extra, sizeof(tv_extra)),
               SyscallSucceeds());
+
+  timeval_with_extra actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv.tv_usec, 432000);
 }
 
 TEST_P(AllSocketPairTest, RecvTimeoutRecvmsgOneSecondSucceeds) {
-- 
cgit v1.2.3


From abf7ebcd38e8c2750f4542f29115140bb2b44a9b Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Thu, 27 Feb 2020 10:59:32 -0800
Subject: Internal change.

PiperOrigin-RevId: 297638665
---
 pkg/sentry/socket/netstack/netstack.go |  40 +++++++++--
 pkg/tcpip/transport/packet/endpoint.go |  21 +++++-
 test/syscalls/linux/packet_socket.cc   | 124 ++++++++++++++++++++++++++++++---
 3 files changed, 167 insertions(+), 18 deletions(-)

(limited to 'test/syscalls')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index e187276c5..48c268bfa 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -712,14 +712,40 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
-	addr, family, err := AddressAndFamily(sockaddr)
-	if err != nil {
-		return err
-	}
-	if err := s.checkFamily(family, true /* exact */); err != nil {
-		return err
+	family := usermem.ByteOrder.Uint16(sockaddr)
+	var addr tcpip.FullAddress
+
+	// Bind for AF_PACKET requires only family, protocol and ifindex.
+	// In function AddressAndFamily, we check the address length which is
+	// not needed for AF_PACKET bind.
+	if family == linux.AF_PACKET {
+		var a linux.SockAddrLink
+		if len(sockaddr) < sockAddrLinkSize {
+			return syserr.ErrInvalidArgument
+		}
+		binary.Unmarshal(sockaddr[:sockAddrLinkSize], usermem.ByteOrder, &a)
+
+		if a.Protocol != uint16(s.protocol) {
+			return syserr.ErrInvalidArgument
+		}
+
+		addr = tcpip.FullAddress{
+			NIC:  tcpip.NICID(a.InterfaceIndex),
+			Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
+		}
+	} else {
+		var err *syserr.Error
+		addr, family, err = AddressAndFamily(sockaddr)
+		if err != nil {
+			return err
+		}
+
+		if err = s.checkFamily(family, true /* exact */); err != nil {
+			return err
+		}
+
+		addr = s.mapFamily(addr, family)
 	}
-	addr = s.mapFamily(addr, family)
 
 	// Issue the bind request to the endpoint.
 	return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 5722815e9..09a1cd436 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -76,6 +76,7 @@ type endpoint struct {
 	sndBufSize int
 	closed     bool
 	stats      tcpip.TransportEndpointStats `state:"nosave"`
+	bound      bool
 }
 
 // NewEndpoint returns a new packet endpoint.
@@ -125,6 +126,7 @@ func (ep *endpoint) Close() {
 	}
 
 	ep.closed = true
+	ep.bound = false
 	ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 }
 
@@ -216,7 +218,24 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
 	// sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex."
 	// - packet(7).
 
-	return tcpip.ErrNotSupported
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+
+	if ep.bound {
+		return tcpip.ErrAlreadyBound
+	}
+
+	// Unregister endpoint with all the nics.
+	ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep)
+
+	// Bind endpoint to receive packets from specific interface.
+	if err := ep.stack.RegisterPacketEndpoint(addr.NIC, ep.netProto, ep); err != nil {
+		return err
+	}
+
+	ep.bound = true
+
+	return nil
 }
 
 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index 92ae55eec..bc22de788 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <arpa/inet.h>
+#include <ifaddrs.h>
 #include <linux/capability.h>
 #include <linux/if_arp.h>
 #include <linux/if_packet.h>
@@ -163,16 +164,11 @@ int CookedPacketTest::GetLoopbackIndex() {
   return ifr.ifr_ifindex;
 }
 
-// Receive via a packet socket.
-TEST_P(CookedPacketTest, Receive) {
-  // Let's use a simple IP payload: a UDP datagram.
-  FileDescriptor udp_sock =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
-  SendUDPMessage(udp_sock.get());
-
+// Receive and verify the message via packet socket on interface.
+void ReceiveMessage(int sock, int ifindex) {
   // Wait for the socket to become readable.
   struct pollfd pfd = {};
-  pfd.fd = socket_;
+  pfd.fd = sock;
   pfd.events = POLLIN;
   EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 2000), SyscallSucceedsWithValue(1));
 
@@ -182,9 +178,10 @@ TEST_P(CookedPacketTest, Receive) {
   char buf[64];
   struct sockaddr_ll src = {};
   socklen_t src_len = sizeof(src);
-  ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0,
+  ASSERT_THAT(recvfrom(sock, buf, sizeof(buf), 0,
                        reinterpret_cast<struct sockaddr*>(&src), &src_len),
               SyscallSucceedsWithValue(packet_size));
+
   // sockaddr_ll ends with an 8 byte physical address field, but ethernet
   // addresses only use 6 bytes.  Linux used to return sizeof(sockaddr_ll)-2
   // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns
@@ -194,7 +191,7 @@ TEST_P(CookedPacketTest, Receive) {
   // TODO(b/129292371): Verify protocol once we return it.
   // Verify the source address.
   EXPECT_EQ(src.sll_family, AF_PACKET);
-  EXPECT_EQ(src.sll_ifindex, GetLoopbackIndex());
+  EXPECT_EQ(src.sll_ifindex, ifindex);
   EXPECT_EQ(src.sll_halen, ETH_ALEN);
   // This came from the loopback device, so the address is all 0s.
   for (int i = 0; i < src.sll_halen; i++) {
@@ -222,6 +219,18 @@ TEST_P(CookedPacketTest, Receive) {
   EXPECT_EQ(strncmp(payload, kMessage, sizeof(kMessage)), 0);
 }
 
+// Receive via a packet socket.
+TEST_P(CookedPacketTest, Receive) {
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  SendUDPMessage(udp_sock.get());
+
+  // Receive and verify the data.
+  int loopback_index = GetLoopbackIndex();
+  ReceiveMessage(socket_, loopback_index);
+}
+
 // Send via a packet socket.
 TEST_P(CookedPacketTest, Send) {
   // TODO(b/129292371): Remove once we support packet socket writing.
@@ -313,6 +322,101 @@ TEST_P(CookedPacketTest, Send) {
   EXPECT_EQ(src.sin_addr.s_addr, htonl(INADDR_LOOPBACK));
 }
 
+// Bind and receive via packet socket.
+TEST_P(CookedPacketTest, BindReceive) {
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = GetLoopbackIndex();
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  SendUDPMessage(udp_sock.get());
+
+  // Receive and verify the data.
+  ReceiveMessage(socket_, bind_addr.sll_ifindex);
+}
+
+// Double Bind socket.
+TEST_P(CookedPacketTest, DoubleBind) {
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = GetLoopbackIndex();
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Binding socket again should fail.
+  ASSERT_THAT(
+      bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+           sizeof(bind_addr)),
+      // Linux 4.09 returns EINVAL here, but some time before 4.19 it switched
+      // to EADDRINUSE.
+      AnyOf(SyscallFailsWithErrno(EADDRINUSE), SyscallFailsWithErrno(EINVAL)));
+}
+
+// Bind and verify we do not receive data on interface which is not bound
+TEST_P(CookedPacketTest, BindDrop) {
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+  struct ifaddrs* if_addr_list = nullptr;
+  auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); });
+
+  ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds());
+
+  // Get interface other than loopback.
+  struct ifreq ifr = {};
+  for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) {
+    if (strcmp(i->ifa_name, "lo") != 0) {
+      strncpy(ifr.ifr_name, i->ifa_name, sizeof(ifr.ifr_name));
+      break;
+    }
+  }
+
+  // Skip if no interface is available other than loopback.
+  if (strlen(ifr.ifr_name) == 0) {
+    GTEST_SKIP();
+  }
+
+  // Get interface index.
+  EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds());
+  EXPECT_NE(ifr.ifr_ifindex, 0);
+
+  // Bind to packet socket requires only family, protocol and ifindex.
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = ifr.ifr_ifindex;
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Send to loopback interface.
+  struct sockaddr_in dest = {};
+  dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+  dest.sin_family = AF_INET;
+  dest.sin_port = kPort;
+  EXPECT_THAT(sendto(udp_sock.get(), kMessage, sizeof(kMessage), 0,
+                     reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)),
+              SyscallSucceedsWithValue(sizeof(kMessage)));
+
+  // Wait and make sure the socket never receives any data.
+  struct pollfd pfd = {};
+  pfd.fd = socket_;
+  pfd.events = POLLIN;
+  EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest,
                          ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
-- 
cgit v1.2.3


From dd1ed5c789ff72fd6bbacda0ff7c7acf9672d25a Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Fri, 28 Feb 2020 14:47:34 +0800
Subject: skip vsyscall test cases on Arm64

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 test/syscalls/linux/time.cc     | 2 ++
 test/syscalls/linux/vsyscall.cc | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc
index 1ccb95733..e75bba669 100644
--- a/test/syscalls/linux/time.cc
+++ b/test/syscalls/linux/time.cc
@@ -26,6 +26,7 @@ namespace {
 
 constexpr long kFudgeSeconds = 5;
 
+#if defined(__x86_64__) || defined(__i386__)
 // Mimics the time(2) wrapper from glibc prior to 2.15.
 time_t vsyscall_time(time_t* t) {
   constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
@@ -98,6 +99,7 @@ TEST(TimeTest, VsyscallGettimeofday_InvalidAddressSIGSEGV) {
                                     reinterpret_cast<struct timezone*>(0x1)),
               ::testing::KilledBySignal(SIGSEGV), "");
 }
+#endif
 
 }  // namespace
 
diff --git a/test/syscalls/linux/vsyscall.cc b/test/syscalls/linux/vsyscall.cc
index 2c2303358..ae4377108 100644
--- a/test/syscalls/linux/vsyscall.cc
+++ b/test/syscalls/linux/vsyscall.cc
@@ -24,6 +24,7 @@ namespace testing {
 
 namespace {
 
+#if defined(__x86_64__) || defined(__i386__)
 time_t vsyscall_time(time_t* t) {
   constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
   return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
@@ -37,6 +38,7 @@ TEST(VsyscallTest, VsyscallAlwaysAvailableOnGvisor) {
   time_t t;
   EXPECT_THAT(vsyscall_time(&t), SyscallSucceeds());
 }
+#endif
 
 }  // namespace
 
-- 
cgit v1.2.3


From 6b4d36e3253238dd72d0861ac1220d147e1de8dd Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 28 Feb 2020 10:37:52 -0800
Subject: Hide /dev/net/tun when using hostinet.

/dev/net/tun does not currently work with hostinet. This has caused some
program starts failing because it thinks the feature exists.

PiperOrigin-RevId: 297876196
---
 pkg/sentry/fs/dev/BUILD                |  1 +
 pkg/sentry/fs/dev/dev.go               |  7 +++++--
 pkg/sentry/fs/dev/net_tun.go           |  7 +++++++
 pkg/sentry/kernel/kernel.go            |  4 ++++
 test/syscalls/BUILD                    |  5 +++++
 test/syscalls/linux/BUILD              | 12 +++++++++++
 test/syscalls/linux/dev.cc             |  7 -------
 test/syscalls/linux/tuntap.cc          |  7 +++++++
 test/syscalls/linux/tuntap_hostinet.cc | 37 ++++++++++++++++++++++++++++++++++
 9 files changed, 78 insertions(+), 9 deletions(-)
 create mode 100644 test/syscalls/linux/tuntap_hostinet.cc

(limited to 'test/syscalls')

diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 9b6bb26d0..9379a4d7b 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -26,6 +26,7 @@ go_library(
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/ramfs",
         "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go
index 7e66c29b0..acbd401a0 100644
--- a/pkg/sentry/fs/dev/dev.go
+++ b/pkg/sentry/fs/dev/dev.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -124,10 +125,12 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		"ptmx": newSymlink(ctx, "pts/ptmx", msrc),
 
 		"tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor),
+	}
 
-		"net": newDirectory(ctx, map[string]*fs.Inode{
+	if isNetTunSupported(inet.StackFromContext(ctx)) {
+		contents["net"] = newDirectory(ctx, map[string]*fs.Inode{
 			"tun": newCharacterDevice(ctx, newNetTunDevice(ctx, fs.RootOwner, 0666), msrc, netTunDevMajor, netTunDevMinor),
-		}, msrc),
+		}, msrc)
 	}
 
 	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
index 755644488..dc7ad075a 100644
--- a/pkg/sentry/fs/dev/net_tun.go
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -168,3 +169,9 @@ func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.Eve
 func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) {
 	fops.device.EventUnregister(e)
 }
+
+// isNetTunSupported returns whether /dev/net/tun device is supported for s.
+func isNetTunSupported(s inet.Stack) bool {
+	_, ok := s.(*netstack.Stack)
+	return ok
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 8b76750e9..1d627564f 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -755,6 +755,8 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
 		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
@@ -1481,6 +1483,8 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
 		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 3518e862d..a69b0ce13 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -680,6 +680,11 @@ syscall_test(
 
 syscall_test(test = "//test/syscalls/linux:tuntap_test")
 
+syscall_test(
+    add_hostinet = True,
+    test = "//test/syscalls/linux:tuntap_hostinet_test",
+)
+
 syscall_test(test = "//test/syscalls/linux:udp_bind_test")
 
 syscall_test(
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 704bae17b..70c120e42 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3460,6 +3460,18 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "tuntap_hostinet_test",
+    testonly = 1,
+    srcs = ["tuntap_hostinet.cc"],
+    linkstatic = 1,
+    deps = [
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
 cc_library(
     name = "udp_socket_test_cases",
     testonly = 1,
diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc
index 4e473268c..4dd302eed 100644
--- a/test/syscalls/linux/dev.cc
+++ b/test/syscalls/linux/dev.cc
@@ -153,13 +153,6 @@ TEST(DevTest, TTYExists) {
   EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
 }
 
-TEST(DevTest, NetTunExists) {
-  struct stat statbuf = {};
-  ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallSucceeds());
-  // Check that it's a character device with rw-rw-rw- permissions.
-  EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
-}
-
 }  // namespace
 }  // namespace testing
 
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
index f6ac9d7b8..f734511d6 100644
--- a/test/syscalls/linux/tuntap.cc
+++ b/test/syscalls/linux/tuntap.cc
@@ -153,6 +153,13 @@ std::string CreateArpPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
 
 }  // namespace
 
+TEST(TuntapStaticTest, NetTunExists) {
+  struct stat statbuf;
+  ASSERT_THAT(stat(kDevNetTun, &statbuf), SyscallSucceeds());
+  // Check that it's a character device with rw-rw-rw- permissions.
+  EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
+}
+
 class TuntapTest : public ::testing::Test {
  protected:
   void TearDown() override {
diff --git a/test/syscalls/linux/tuntap_hostinet.cc b/test/syscalls/linux/tuntap_hostinet.cc
new file mode 100644
index 000000000..0c527419e
--- /dev/null
+++ b/test/syscalls/linux/tuntap_hostinet.cc
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(TuntapHostInetTest, NoNetTun) {
+  SKIP_IF(!IsRunningOnGvisor());
+
+  struct stat statbuf;
+  ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallFailsWithErrno(ENOENT));
+}
+
+}  // namespace
+}  // namespace testing
+
+}  // namespace gvisor
-- 
cgit v1.2.3


From 36b193b1db60cad3c1c65ce3abef03a6a0594e3e Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Mon, 2 Mar 2020 07:13:47 +0000
Subject: Fix syscall test build error on arm64.

The error was introduced in the merge of PR #1471.
Some codes are missing when adding bazel select_arch
command to the test/syscall/linux/BUILD file.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I8cae3f4ae78c2e14671f3ac6e7361dc2806d9305
---
 test/syscalls/linux/BUILD | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 70c120e42..9ab13ba07 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -608,7 +608,10 @@ cc_binary(
 cc_binary(
     name = "exceptions_test",
     testonly = 1,
-    srcs = ["exceptions.cc"],
+    srcs = select_arch(
+	amd64 = ["exceptions.cc"],
+        arm64 = [],
+    ),
     linkstatic = 1,
     deps = [
         gtest,
@@ -1475,7 +1478,10 @@ cc_binary(
 cc_binary(
     name = "arch_prctl_test",
     testonly = 1,
-    srcs = ["arch_prctl.cc"],
+    srcs = select_arch(
+        amd64 = ["arch_prctl.cc"],
+        arm64 = [],
+    ),
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
@@ -3322,7 +3328,10 @@ cc_binary(
 cc_binary(
     name = "sysret_test",
     testonly = 1,
-    srcs = ["sysret.cc"],
+    srcs = select_arch(
+        amd64 = ["sysret.cc"],
+        arm64 = [],
+    ),
     linkstatic = 1,
     deps = [
         gtest,
-- 
cgit v1.2.3


From 33101752501fafea99d77f34bbd65f3e0083d22e Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Mon, 2 Mar 2020 14:43:52 -0800
Subject: Fix data-race when reading/writing e.amss.

PiperOrigin-RevId: 298451319
---
 pkg/tcpip/transport/tcp/connect.go  | 11 +++++++++--
 pkg/tcpip/transport/tcp/endpoint.go | 29 ++++++++++++++++++-----------
 test/syscalls/linux/tcp_socket.cc   | 15 +++++++++++++++
 3 files changed, 42 insertions(+), 13 deletions(-)

(limited to 'test/syscalls')

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index cd247f3e1..ae4f3f3a9 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -295,6 +295,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	h.state = handshakeSynRcvd
 	h.ep.mu.Lock()
 	ttl := h.ep.ttl
+	amss := h.ep.amss
 	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
 	synOpts := header.TCPSynOptions{
@@ -307,7 +308,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 		// permits SACK. This is not explicitly defined in the RFC but
 		// this is the behaviour implemented by Linux.
 		SACKPermitted: rcvSynOpts.SACKPermitted,
-		MSS:           h.ep.amss,
+		MSS:           amss,
 	}
 	if ttl == 0 {
 		ttl = s.route.DefaultTTL()
@@ -356,6 +357,10 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			return tcpip.ErrInvalidEndpointState
 		}
 
+		h.ep.mu.RLock()
+		amss := h.ep.amss
+		h.ep.mu.RUnlock()
+
 		h.resetState()
 		synOpts := header.TCPSynOptions{
 			WS:            h.rcvWndScale,
@@ -363,7 +368,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			TSVal:         h.ep.timestamp(),
 			TSEcr:         h.ep.recentTimestamp(),
 			SACKPermitted: h.ep.sackPermitted,
-			MSS:           h.ep.amss,
+			MSS:           amss,
 		}
 		h.ep.sendSynTCP(&s.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
 		return nil
@@ -530,6 +535,7 @@ func (h *handshake) execute() *tcpip.Error {
 
 	// Send the initial SYN segment and loop until the handshake is
 	// completed.
+	h.ep.mu.Lock()
 	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
 
 	synOpts := header.TCPSynOptions{
@@ -540,6 +546,7 @@ func (h *handshake) execute() *tcpip.Error {
 		SACKPermitted: bool(sackEnabled),
 		MSS:           h.ep.amss,
 	}
+	h.ep.mu.Unlock()
 
 	// Execute is also called in a listen context so we want to make sure we
 	// only send the TS/SACK option when we received the TS/SACK in the
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 9e72730bd..8b9154e69 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -959,15 +959,18 @@ func (e *endpoint) initialReceiveWindow() int {
 // ModerateRecvBuf adjusts the receive buffer and the advertised window
 // based on the number of bytes copied to user space.
 func (e *endpoint) ModerateRecvBuf(copied int) {
+	e.mu.RLock()
 	e.rcvListMu.Lock()
 	if e.rcvAutoParams.disabled {
 		e.rcvListMu.Unlock()
+		e.mu.RUnlock()
 		return
 	}
 	now := time.Now()
 	if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt {
 		e.rcvAutoParams.copied += copied
 		e.rcvListMu.Unlock()
+		e.mu.RUnlock()
 		return
 	}
 	prevRTTCopied := e.rcvAutoParams.copied + copied
@@ -1008,7 +1011,7 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 			e.rcvBufSize = rcvWnd
 			availAfter := e.receiveBufferAvailableLocked()
 			mask := uint32(notifyReceiveWindowChanged)
-			if crossed, above := e.windowCrossedACKThreshold(availAfter - availBefore); crossed && above {
+			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
 				mask |= notifyNonZeroReceiveWindow
 			}
 			e.notifyProtocolGoroutine(mask)
@@ -1023,6 +1026,7 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 	e.rcvAutoParams.measureTime = now
 	e.rcvAutoParams.copied = 0
 	e.rcvListMu.Unlock()
+	e.mu.RUnlock()
 }
 
 // IPTables implements tcpip.Endpoint.IPTables.
@@ -1052,7 +1056,6 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 
 	v, err := e.readLocked()
 	e.rcvListMu.Unlock()
-
 	e.mu.RUnlock()
 
 	if err == tcpip.ErrClosedForReceive {
@@ -1085,7 +1088,7 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 	// enough buffer space, to either fit an aMSS or half a receive buffer
 	// (whichever smaller), then notify the protocol goroutine to send a
 	// window update.
-	if crossed, above := e.windowCrossedACKThreshold(len(v)); crossed && above {
+	if crossed, above := e.windowCrossedACKThresholdLocked(len(v)); crossed && above {
 		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
 	}
 
@@ -1303,9 +1306,9 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	return num, tcpip.ControlMessages{}, nil
 }
 
-// windowCrossedACKThreshold checks if the receive window to be announced now
-// would be under aMSS or under half receive buffer, whichever smaller. This is
-// useful as a receive side silly window syndrome prevention mechanism. If
+// windowCrossedACKThresholdLocked checks if the receive window to be announced
+// now would be under aMSS or under half receive buffer, whichever smaller. This
+// is useful as a receive side silly window syndrome prevention mechanism. If
 // window grows to reasonable value, we should send ACK to the sender to inform
 // the rx space is now large. We also want ensure a series of small read()'s
 // won't trigger a flood of spurious tiny ACK's.
@@ -1316,7 +1319,9 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 // crossed will be true if the window size crossed the ACK threshold.
 // above will be true if the new window is >= ACK threshold and false
 // otherwise.
-func (e *endpoint) windowCrossedACKThreshold(deltaBefore int) (crossed bool, above bool) {
+//
+// Precondition: e.mu and e.rcvListMu must be held.
+func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed bool, above bool) {
 	newAvail := e.receiveBufferAvailableLocked()
 	oldAvail := newAvail - deltaBefore
 	if oldAvail < 0 {
@@ -1379,6 +1384,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 
 		mask := uint32(notifyReceiveWindowChanged)
 
+		e.mu.RLock()
 		e.rcvListMu.Lock()
 
 		// Make sure the receive buffer size allows us to send a
@@ -1405,11 +1411,11 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		// Immediately send an ACK to uncork the sender silly window
 		// syndrome prevetion, when our available space grows above aMSS
 		// or half receive buffer, whichever smaller.
-		if crossed, above := e.windowCrossedACKThreshold(availAfter - availBefore); crossed && above {
+		if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
 			mask |= notifyNonZeroReceiveWindow
 		}
 		e.rcvListMu.Unlock()
-
+		e.mu.RUnlock()
 		e.notifyProtocolGoroutine(mask)
 		return nil
 
@@ -2414,13 +2420,14 @@ func (e *endpoint) updateSndBufferUsage(v int) {
 // to be read, or when the connection is closed for receiving (in which case
 // s will be nil).
 func (e *endpoint) readyToRead(s *segment) {
+	e.mu.RLock()
 	e.rcvListMu.Lock()
 	if s != nil {
 		s.incRef()
 		e.rcvBufUsed += s.data.Size()
 		// Increase counter if the receive window falls down below MSS
 		// or half receive buffer size, whichever smaller.
-		if crossed, above := e.windowCrossedACKThreshold(-s.data.Size()); crossed && !above {
+		if crossed, above := e.windowCrossedACKThresholdLocked(-s.data.Size()); crossed && !above {
 			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
 		}
 		e.rcvList.PushBack(s)
@@ -2428,7 +2435,7 @@ func (e *endpoint) readyToRead(s *segment) {
 		e.rcvClosed = true
 	}
 	e.rcvListMu.Unlock()
-
+	e.mu.RUnlock()
 	e.waiterQueue.Notify(waiter.EventIn)
 }
 
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index c4591a3b9..579463384 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1349,6 +1349,21 @@ TEST_P(SimpleTcpSocketTest, RecvOnClosedSocket) {
               SyscallFailsWithErrno(ENOTCONN));
 }
 
+TEST_P(SimpleTcpSocketTest, TCPConnectSoRcvBufRace) {
+  auto s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(GetParam(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+  sockaddr_storage addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t addrlen = sizeof(addr);
+
+  RetryEINTR(connect)(s.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                      addrlen);
+  int buf_sz = 1 << 18;
+  EXPECT_THAT(
+      setsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &buf_sz, sizeof(buf_sz)),
+      SyscallSucceedsWithValue(0));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From 43abb24657e737dee1108ff0d512b2e1b6d8a3f6 Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Mon, 2 Mar 2020 16:30:51 -0800
Subject: Fix panic caused by invalid address for Bind in packet sockets.

PiperOrigin-RevId: 298476533
---
 pkg/sentry/socket/netstack/netstack.go |  4 ++++
 test/syscalls/linux/packet_socket.cc   | 13 +++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'test/syscalls')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 1eeb37446..13a9a60b4 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -712,6 +712,10 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
+	if len(sockaddr) < 2 {
+		return syserr.ErrInvalidArgument
+	}
+
 	family := usermem.ByteOrder.Uint16(sockaddr)
 	var addr tcpip.FullAddress
 
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index bc22de788..248762ca9 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -417,6 +417,19 @@ TEST_P(CookedPacketTest, BindDrop) {
   EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
 }
 
+// Bind with invalid address.
+TEST_P(CookedPacketTest, BindFail) {
+  // Null address.
+  ASSERT_THAT(bind(socket_, nullptr, sizeof(struct sockaddr)),
+              SyscallFailsWithErrno(EFAULT));
+
+  // Address of size 1.
+  uint8_t addr = 0;
+  ASSERT_THAT(
+      bind(socket_, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+      SyscallFailsWithErrno(EINVAL));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest,
                          ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
-- 
cgit v1.2.3


From fc3a09cd3c56ef20fd398a5f61a5e59111ed55b3 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Tue, 3 Mar 2020 17:45:10 +0800
Subject: code clean: minor changes to compatible with ubuntu18.04

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 test/syscalls/linux/bad.cc     | 2 +-
 test/syscalls/linux/seccomp.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/bad.cc b/test/syscalls/linux/bad.cc
index adfb149df..a26fc6af3 100644
--- a/test/syscalls/linux/bad.cc
+++ b/test/syscalls/linux/bad.cc
@@ -28,7 +28,7 @@ namespace {
 constexpr uint32_t kNotImplementedSyscall = SYS_get_kernel_syms;
 #elif __aarch64__
 // Use the last of arch_specific_syscalls which are not implemented on arm64.
-constexpr uint32_t kNotImplementedSyscall = SYS_arch_specific_syscall + 15;
+constexpr uint32_t kNotImplementedSyscall = __NR_arch_specific_syscall + 15;
 #endif
 
 TEST(BadSyscallTest, NotImplemented) {
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index cf6499f8b..8e0fc9acc 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -53,7 +53,7 @@ namespace {
 constexpr uint32_t kFilteredSyscall = SYS_vserver;
 #elif __aarch64__
 // Use the last of arch_specific_syscalls which are not implemented on arm64.
-constexpr uint32_t kFilteredSyscall = SYS_arch_specific_syscall + 15;
+constexpr uint32_t kFilteredSyscall = __NR_arch_specific_syscall + 15;
 #endif
 
 // Applies a seccomp-bpf filter that returns `filtered_result` for
-- 
cgit v1.2.3


From b3c549d8391e7cadd82a5ab9280bc63bb372aa97 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 3 Mar 2020 12:36:37 -0800
Subject: Move temp_umask to test/util.

PiperOrigin-RevId: 298667595
---
 test/syscalls/linux/BUILD          |  9 ++-------
 test/syscalls/linux/mkdir.cc       |  2 +-
 test/syscalls/linux/open_create.cc |  2 +-
 test/syscalls/linux/temp_umask.h   | 39 --------------------------------------
 test/util/BUILD                    |  6 ++++++
 test/util/temp_umask.h             | 39 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 49 insertions(+), 48 deletions(-)
 delete mode 100644 test/syscalls/linux/temp_umask.h
 create mode 100644 test/util/temp_umask.h

(limited to 'test/syscalls')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 70c120e42..dae2b1077 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -166,11 +166,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "temp_umask",
-    hdrs = ["temp_umask.h"],
-)
-
 cc_library(
     name = "unix_domain_socket_test_util",
     testonly = 1,
@@ -1140,11 +1135,11 @@ cc_binary(
     srcs = ["mkdir.cc"],
     linkstatic = 1,
     deps = [
-        ":temp_umask",
         "//test/util:capability_util",
         "//test/util:fs_util",
         gtest,
         "//test/util:temp_path",
+        "//test/util:temp_umask",
         "//test/util:test_main",
         "//test/util:test_util",
     ],
@@ -1299,12 +1294,12 @@ cc_binary(
     srcs = ["open_create.cc"],
     linkstatic = 1,
     deps = [
-        ":temp_umask",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
         gtest,
         "//test/util:temp_path",
+        "//test/util:temp_umask",
         "//test/util:test_main",
         "//test/util:test_util",
     ],
diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc
index cf138d328..def4c50a4 100644
--- a/test/syscalls/linux/mkdir.cc
+++ b/test/syscalls/linux/mkdir.cc
@@ -18,10 +18,10 @@
 #include <unistd.h>
 
 #include "gtest/gtest.h"
-#include "test/syscalls/linux/temp_umask.h"
 #include "test/util/capability_util.h"
 #include "test/util/fs_util.h"
 #include "test/util/temp_path.h"
+#include "test/util/temp_umask.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc
index 902d0a0dc..51eacf3f2 100644
--- a/test/syscalls/linux/open_create.cc
+++ b/test/syscalls/linux/open_create.cc
@@ -19,11 +19,11 @@
 #include <unistd.h>
 
 #include "gtest/gtest.h"
-#include "test/syscalls/linux/temp_umask.h"
 #include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/fs_util.h"
 #include "test/util/temp_path.h"
+#include "test/util/temp_umask.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
diff --git a/test/syscalls/linux/temp_umask.h b/test/syscalls/linux/temp_umask.h
deleted file mode 100644
index 81a25440c..000000000
--- a/test/syscalls/linux/temp_umask.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_
-#define GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_
-
-#include <sys/stat.h>
-#include <sys/types.h>
-
-namespace gvisor {
-namespace testing {
-
-class TempUmask {
- public:
-  // Sets the process umask to `mask`.
-  explicit TempUmask(mode_t mask) : old_mask_(umask(mask)) {}
-
-  // Sets the process umask to its previous value.
-  ~TempUmask() { umask(old_mask_); }
-
- private:
-  mode_t old_mask_;
-};
-
-}  // namespace testing
-}  // namespace gvisor
-
-#endif  // GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_
diff --git a/test/util/BUILD b/test/util/BUILD
index 8b5a0f25c..2a17c33ee 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -350,3 +350,9 @@ cc_library(
         ":save_util",
     ],
 )
+
+cc_library(
+    name = "temp_umask",
+    testonly = 1,
+    hdrs = ["temp_umask.h"],
+)
diff --git a/test/util/temp_umask.h b/test/util/temp_umask.h
new file mode 100644
index 000000000..e7de84a54
--- /dev/null
+++ b/test/util/temp_umask.h
@@ -0,0 +1,39 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_UTIL_TEMP_UMASK_H_
+#define GVISOR_TEST_UTIL_TEMP_UMASK_H_
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+namespace gvisor {
+namespace testing {
+
+class TempUmask {
+ public:
+  // Sets the process umask to `mask`.
+  explicit TempUmask(mode_t mask) : old_mask_(umask(mask)) {}
+
+  // Sets the process umask to its previous value.
+  ~TempUmask() { umask(old_mask_); }
+
+ private:
+  mode_t old_mask_;
+};
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_UTIL_TEMP_UMASK_H_
-- 
cgit v1.2.3


From 504c9e14d61a9ca9fa3615290a05471684019ecc Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Tue, 3 Mar 2020 15:53:48 -0800
Subject: test/runner: use proper filters for test cases

The benchmark_filter options accepts regex-s, but
the gtest-filter option accepts shell-like wildcards.

Fixes #2034

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 test/runner/gtest/gtest.go             | 7 ++++---
 test/syscalls/linux/tuntap_hostinet.cc | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'test/syscalls')

diff --git a/test/runner/gtest/gtest.go b/test/runner/gtest/gtest.go
index f96e2415e..869169ad5 100644
--- a/test/runner/gtest/gtest.go
+++ b/test/runner/gtest/gtest.go
@@ -66,13 +66,12 @@ func (tc TestCase) Args() []string {
 	}
 	if tc.benchmark {
 		return []string{
-			fmt.Sprintf("%s=^$", filterTestFlag),
 			fmt.Sprintf("%s=^%s$", filterBenchmarkFlag, tc.Name),
+			fmt.Sprintf("%s=", filterTestFlag),
 		}
 	}
 	return []string{
-		fmt.Sprintf("%s=^%s$", filterTestFlag, tc.FullName()),
-		fmt.Sprintf("%s=^$", filterBenchmarkFlag),
+		fmt.Sprintf("%s=%s", filterTestFlag, tc.FullName()),
 	}
 }
 
@@ -147,6 +146,8 @@ func ParseTestCases(testBin string, benchmarks bool, extraArgs ...string) ([]Tes
 		return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v\nstderr\n%s", err, exitErr.Stderr)
 	}
 
+	out = []byte(strings.Trim(string(out), "\n"))
+
 	// Parse benchmark output.
 	for _, line := range strings.Split(string(out), "\n") {
 		// Strip comments.
diff --git a/test/syscalls/linux/tuntap_hostinet.cc b/test/syscalls/linux/tuntap_hostinet.cc
index 0c527419e..1513fb9d5 100644
--- a/test/syscalls/linux/tuntap_hostinet.cc
+++ b/test/syscalls/linux/tuntap_hostinet.cc
@@ -26,6 +26,7 @@ namespace {
 
 TEST(TuntapHostInetTest, NoNetTun) {
   SKIP_IF(!IsRunningOnGvisor());
+  SKIP_IF(!IsRunningWithHostinet());
 
   struct stat statbuf;
   ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallFailsWithErrno(ENOENT));
-- 
cgit v1.2.3


From da48fc6cca23a38faef51c5b5f8ae609940773a0 Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Thu, 5 Mar 2020 18:21:39 -0800
Subject: Stub oom_score_adj and oom_score.

Adds an oom_score_adj and oom_score proc file stub. oom_score_adj accepts
writes of values -1000 to 1000 and persists the value with the task. New tasks
inherit the parent's oom_score_adj.

oom_score is a read-only stub that always returns the value '0'.

Issue #202

PiperOrigin-RevId: 299245355
---
 pkg/sentry/fs/proc/task.go               | 126 ++++++++++++++++++++++++++-----
 pkg/sentry/fsimpl/proc/task.go           |  12 +--
 pkg/sentry/fsimpl/proc/task_files.go     |  43 +++++++++++
 pkg/sentry/fsimpl/proc/tasks_test.go     |  32 ++++----
 pkg/sentry/kernel/task.go                |  33 ++++++++
 pkg/sentry/kernel/task_clone.go          |   6 ++
 pkg/sentry/kernel/task_start.go          |   4 +
 test/syscalls/BUILD                      |   8 +-
 test/syscalls/linux/BUILD                |  13 ++++
 test/syscalls/linux/proc.cc              |  21 ++++++
 test/syscalls/linux/proc_pid_oomscore.cc |  72 ++++++++++++++++++
 11 files changed, 330 insertions(+), 40 deletions(-)
 create mode 100644 test/syscalls/linux/proc_pid_oomscore.cc

(limited to 'test/syscalls')

diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 8ab8d8a02..4e9b0fc00 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -72,24 +72,26 @@ var _ fs.InodeOperations = (*taskDir)(nil)
 // newTaskDir creates a new proc task entry.
 func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
 	contents := map[string]*fs.Inode{
-		"auxv":      newAuxvec(t, msrc),
-		"cmdline":   newExecArgInode(t, msrc, cmdlineExecArg),
-		"comm":      newComm(t, msrc),
-		"environ":   newExecArgInode(t, msrc, environExecArg),
-		"exe":       newExe(t, msrc),
-		"fd":        newFdDir(t, msrc),
-		"fdinfo":    newFdInfoDir(t, msrc),
-		"gid_map":   newGIDMap(t, msrc),
-		"io":        newIO(t, msrc, isThreadGroup),
-		"maps":      newMaps(t, msrc),
-		"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
-		"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
-		"ns":        newNamespaceDir(t, msrc),
-		"smaps":     newSmaps(t, msrc),
-		"stat":      newTaskStat(t, msrc, isThreadGroup, p.pidns),
-		"statm":     newStatm(t, msrc),
-		"status":    newStatus(t, msrc, p.pidns),
-		"uid_map":   newUIDMap(t, msrc),
+		"auxv":          newAuxvec(t, msrc),
+		"cmdline":       newExecArgInode(t, msrc, cmdlineExecArg),
+		"comm":          newComm(t, msrc),
+		"environ":       newExecArgInode(t, msrc, environExecArg),
+		"exe":           newExe(t, msrc),
+		"fd":            newFdDir(t, msrc),
+		"fdinfo":        newFdInfoDir(t, msrc),
+		"gid_map":       newGIDMap(t, msrc),
+		"io":            newIO(t, msrc, isThreadGroup),
+		"maps":          newMaps(t, msrc),
+		"mountinfo":     seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
+		"mounts":        seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
+		"ns":            newNamespaceDir(t, msrc),
+		"oom_score":     newOOMScore(t, msrc),
+		"oom_score_adj": newOOMScoreAdj(t, msrc),
+		"smaps":         newSmaps(t, msrc),
+		"stat":          newTaskStat(t, msrc, isThreadGroup, p.pidns),
+		"statm":         newStatm(t, msrc),
+		"status":        newStatus(t, msrc, p.pidns),
+		"uid_map":       newUIDMap(t, msrc),
 	}
 	if isThreadGroup {
 		contents["task"] = p.newSubtasks(t, msrc)
@@ -796,4 +798,92 @@ func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc
 	return int64(n), err
 }
 
+// newOOMScore returns a oom_score file. It is a stub that always returns 0.
+// TODO(gvisor.dev/issue/1967)
+func newOOMScore(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	return newStaticProcInode(t, msrc, []byte("0\n"))
+}
+
+// oomScoreAdj is a file containing the oom_score adjustment for a task.
+//
+// +stateify savable
+type oomScoreAdj struct {
+	fsutil.SimpleFileInode
+
+	t *kernel.Task
+}
+
+// +stateify savable
+type oomScoreAdjFile struct {
+	fsutil.FileGenericSeek          `state:"nosave"`
+	fsutil.FileNoIoctl              `state:"nosave"`
+	fsutil.FileNoMMap               `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoopFlush            `state:"nosave"`
+	fsutil.FileNoopFsync            `state:"nosave"`
+	fsutil.FileNoopRelease          `state:"nosave"`
+	fsutil.FileNotDirReaddir        `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+	waiter.AlwaysReady              `state:"nosave"`
+
+	t *kernel.Task
+}
+
+// newOOMScoreAdj returns a oom_score_adj file.
+func newOOMScoreAdj(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	i := &oomScoreAdj{
+		SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
+		t:               t,
+	}
+	return newProcInode(t, i, msrc, fs.SpecialFile, t)
+}
+
+// Truncate implements fs.InodeOperations.Truncate. Truncate is called when
+// O_TRUNC is specified for any kind of existing Dirent but is not called via
+// (f)truncate for proc files.
+func (*oomScoreAdj) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return fs.NewFile(ctx, dirent, flags, &oomScoreAdjFile{t: o.t}), nil
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	if offset != 0 {
+		return 0, io.EOF
+	}
+	adj, err := f.t.OOMScoreAdj()
+	if err != nil {
+		return 0, err
+	}
+	adjBytes := []byte(strconv.FormatInt(int64(adj), 10) + "\n")
+	n, err := dst.CopyOut(ctx, adjBytes)
+	return int64(n), err
+}
+
+// Write implements fs.FileOperations.Write.
+func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+
+	if err := f.t.SetOOMScoreAdj(v); err != nil {
+		return 0, err
+	}
+
+	return n, nil
+}
+
 // LINT.ThenChange(../../fsimpl/proc/task.go|../../fsimpl/proc/task_files.go)
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 2d814668a..18e5cd6f6 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -62,11 +62,13 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames
 			"pid":  newNamespaceSymlink(task, inoGen.NextIno(), "pid"),
 			"user": newNamespaceSymlink(task, inoGen.NextIno(), "user"),
 		}),
-		"smaps":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}),
-		"stat":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
-		"statm":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}),
-		"status":  newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
-		"uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}),
+		"oom_score":     newTaskOwnedFile(task, inoGen.NextIno(), 0444, newStaticFile("0\n")),
+		"oom_score_adj": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &oomScoreAdj{task: task}),
+		"smaps":         newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}),
+		"stat":          newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
+		"statm":         newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}),
+		"status":        newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
+		"uid_map":       newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}),
 	}
 	if isThreadGroup {
 		contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers)
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index efd3b3453..5a231ac86 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -525,3 +525,46 @@ func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
 	return nil
 }
+
+// oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file.
+//
+// +stateify savable
+type oomScoreAdj struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	adj, err := o.task.OOMScoreAdj()
+	if err != nil {
+		return err
+	}
+	fmt.Fprintf(buf, "%d\n", adj)
+	return nil
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+
+	if err := o.task.SetOOMScoreAdj(v); err != nil {
+		return 0, err
+	}
+
+	return n, nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index c5d531fe0..0eb401619 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -63,21 +63,23 @@ var (
 		"thread-self": threadSelfLink.NextOff,
 	}
 	taskStaticFiles = map[string]testutil.DirentType{
-		"auxv":    linux.DT_REG,
-		"cgroup":  linux.DT_REG,
-		"cmdline": linux.DT_REG,
-		"comm":    linux.DT_REG,
-		"environ": linux.DT_REG,
-		"gid_map": linux.DT_REG,
-		"io":      linux.DT_REG,
-		"maps":    linux.DT_REG,
-		"ns":      linux.DT_DIR,
-		"smaps":   linux.DT_REG,
-		"stat":    linux.DT_REG,
-		"statm":   linux.DT_REG,
-		"status":  linux.DT_REG,
-		"task":    linux.DT_DIR,
-		"uid_map": linux.DT_REG,
+		"auxv":          linux.DT_REG,
+		"cgroup":        linux.DT_REG,
+		"cmdline":       linux.DT_REG,
+		"comm":          linux.DT_REG,
+		"environ":       linux.DT_REG,
+		"gid_map":       linux.DT_REG,
+		"io":            linux.DT_REG,
+		"maps":          linux.DT_REG,
+		"ns":            linux.DT_DIR,
+		"oom_score":     linux.DT_REG,
+		"oom_score_adj": linux.DT_REG,
+		"smaps":         linux.DT_REG,
+		"stat":          linux.DT_REG,
+		"statm":         linux.DT_REG,
+		"status":        linux.DT_REG,
+		"task":          linux.DT_DIR,
+		"uid_map":       linux.DT_REG,
 	}
 )
 
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 2cee2e6ed..c0dbbe890 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -554,6 +555,13 @@ type Task struct {
 	//
 	// startTime is protected by mu.
 	startTime ktime.Time
+
+	// oomScoreAdj is the task's OOM score adjustment. This is currently not
+	// used but is maintained for consistency.
+	// TODO(gvisor.dev/issue/1967)
+	//
+	// oomScoreAdj is protected by mu, and is owned by the task goroutine.
+	oomScoreAdj int32
 }
 
 func (t *Task) savePtraceTracer() *Task {
@@ -847,3 +855,28 @@ func (t *Task) AbstractSockets() *AbstractSocketNamespace {
 func (t *Task) ContainerID() string {
 	return t.containerID
 }
+
+// OOMScoreAdj gets the task's OOM score adjustment.
+func (t *Task) OOMScoreAdj() (int32, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.ExitState() == TaskExitDead {
+		return 0, syserror.ESRCH
+	}
+	return t.oomScoreAdj, nil
+}
+
+// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be
+// between -1000 and 1000 inclusive.
+func (t *Task) SetOOMScoreAdj(adj int32) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.ExitState() == TaskExitDead {
+		return syserror.ESRCH
+	}
+	if adj > 1000 || adj < -1000 {
+		return syserror.EINVAL
+	}
+	t.oomScoreAdj = adj
+	return nil
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 78866f280..dda502bb8 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -264,6 +264,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		rseqSignature = t.rseqSignature
 	}
 
+	adj, err := t.OOMScoreAdj()
+	if err != nil {
+		return 0, nil, err
+	}
+
 	cfg := &TaskConfig{
 		Kernel:                  t.k,
 		ThreadGroup:             tg,
@@ -282,6 +287,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		RSeqAddr:                rseqAddr,
 		RSeqSignature:           rseqSignature,
 		ContainerID:             t.ContainerID(),
+		OOMScoreAdj:             adj,
 	}
 	if opts.NewThreadGroup {
 		cfg.Parent = t
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index a5035bb7f..2bbf48bb8 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -93,6 +93,9 @@ type TaskConfig struct {
 
 	// ContainerID is the container the new task belongs to.
 	ContainerID string
+
+	// oomScoreAdj is the task's OOM score adjustment.
+	OOMScoreAdj int32
 }
 
 // NewTask creates a new task defined by cfg.
@@ -143,6 +146,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		rseqSignature:      cfg.RSeqSignature,
 		futexWaiter:        futex.NewWaiter(),
 		containerID:        cfg.ContainerID,
+		oomScoreAdj:        cfg.OOMScoreAdj,
 	}
 	t.creds.Store(cfg.Credentials)
 	t.endStopCond.L = &t.tg.signalHandlers.mu
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index a69b0ce13..9800a0cdf 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -318,10 +318,14 @@ syscall_test(
     test = "//test/syscalls/linux:proc_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test")
-
 syscall_test(test = "//test/syscalls/linux:proc_net_test")
 
+syscall_test(test = "//test/syscalls/linux:proc_pid_oomscore_test")
+
+syscall_test(test = "//test/syscalls/linux:proc_pid_smaps_test")
+
+syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test")
+
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:pselect_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 0fbd556de..43455f1a3 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1631,6 +1631,19 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "proc_pid_oomscore_test",
+    testonly = 1,
+    srcs = ["proc_pid_oomscore.cc"],
+    linkstatic = 1,
+    deps = [
+        "//test/util:fs_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_binary(
     name = "proc_pid_smaps_test",
     testonly = 1,
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index f91187e75..5a70f6c3b 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1431,6 +1431,12 @@ TEST(ProcPidFile, SubprocessRunning) {
 
   EXPECT_THAT(ReadWhileRunning("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileRunning("oom_score", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileRunning("oom_score_adj", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
 }
 
 // Test whether /proc/PID/ files can be read for a zombie process.
@@ -1466,6 +1472,12 @@ TEST(ProcPidFile, SubprocessZombie) {
   EXPECT_THAT(ReadWhileZombied("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
 
+  EXPECT_THAT(ReadWhileZombied("oom_score", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileZombied("oom_score_adj", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
   //
@@ -1527,6 +1539,15 @@ TEST(ProcPidFile, SubprocessExited) {
 
   EXPECT_THAT(ReadWhileExited("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME(gvisor.dev/issue/164): Succeeds on gVisor.
+    EXPECT_THAT(ReadWhileExited("oom_score", buf, sizeof(buf)),
+                SyscallFailsWithErrno(ESRCH));
+  }
+
+  EXPECT_THAT(ReadWhileExited("oom_score_adj", buf, sizeof(buf)),
+              SyscallFailsWithErrno(ESRCH));
 }
 
 PosixError DirContainsImpl(absl::string_view path,
diff --git a/test/syscalls/linux/proc_pid_oomscore.cc b/test/syscalls/linux/proc_pid_oomscore.cc
new file mode 100644
index 000000000..707821a3f
--- /dev/null
+++ b/test/syscalls/linux/proc_pid_oomscore.cc
@@ -0,0 +1,72 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+
+#include <exception>
+#include <iostream>
+#include <string>
+
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+PosixErrorOr<int> ReadProcNumber(std::string path) {
+  ASSIGN_OR_RETURN_ERRNO(std::string contents, GetContents(path));
+  EXPECT_EQ(contents[contents.length() - 1], '\n');
+
+  int num;
+  if (!absl::SimpleAtoi(contents, &num)) {
+    return PosixError(EINVAL, "invalid value: " + contents);
+  }
+
+  return num;
+}
+
+TEST(ProcPidOomscoreTest, BasicRead) {
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score"));
+  EXPECT_LE(oom_score, 1000);
+  EXPECT_GE(oom_score, -1000);
+}
+
+TEST(ProcPidOomscoreAdjTest, BasicRead) {
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj"));
+
+  // oom_score_adj defaults to 0.
+  EXPECT_EQ(oom_score, 0);
+}
+
+TEST(ProcPidOomscoreAdjTest, BasicWrite) {
+  constexpr int test_value = 7;
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/oom_score_adj", O_WRONLY));
+  ASSERT_THAT(
+      RetryEINTR(write)(fd.get(), std::to_string(test_value).c_str(), 1),
+      SyscallSucceeds());
+
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj"));
+  EXPECT_EQ(oom_score, test_value);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From d5dbe366bf7c9f5b648b8114a9dc7f45589899b1 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Fri, 6 Mar 2020 11:41:10 -0800
Subject: shutdown(s, SHUT_WR) in TIME-WAIT returns ENOTCONN

From RFC 793 s3.9 p61 Event Processing:

CLOSE Call during TIME-WAIT: return with "error: connection closing"

Fixes #1603

PiperOrigin-RevId: 299401353
---
 pkg/tcpip/transport/tcp/endpoint.go |  5 ++++-
 test/syscalls/linux/tcp_socket.cc   | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'test/syscalls')

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 40cc664c0..dc9c18b6f 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2117,10 +2117,13 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 		// Close for write.
 		if (e.shutdownFlags & tcpip.ShutdownWrite) != 0 {
 			e.sndBufMu.Lock()
-
 			if e.sndClosed {
 				// Already closed.
 				e.sndBufMu.Unlock()
+				if e.EndpointState() == StateTimeWait {
+					e.mu.Unlock()
+					return tcpip.ErrNotConnected
+				}
 				break
 			}
 
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 579463384..d9c1ac0e1 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -143,6 +143,20 @@ TEST_P(TcpSocketTest, ConnectOnEstablishedConnection) {
       SyscallFailsWithErrno(EISCONN));
 }
 
+TEST_P(TcpSocketTest, ShutdownWriteInTimeWait) {
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  EXPECT_THAT(shutdown(s_, SHUT_RDWR), SyscallSucceeds());
+  absl::SleepFor(absl::Seconds(1));  // Wait to enter TIME_WAIT.
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(TcpSocketTest, ShutdownWriteInFinWait1) {
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  absl::SleepFor(absl::Seconds(1));  // Wait to enter FIN-WAIT2.
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+}
+
 TEST_P(TcpSocketTest, DataCoalesced) {
   char buf[10];
 
-- 
cgit v1.2.3