From e3d4a6773923a884986aaa4bb272431ce27764e2 Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <henry.tjf@antfin.com>
Date: Tue, 2 Apr 2019 17:13:00 +0800
Subject: support /proc/net/snmp

This proc file contains statistics according to [1].

[1] https://tools.ietf.org/html/rfc2013

Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
Change-Id: I9662132085edd8a7783d356ce4237d7ac0800d94
---
 test/syscalls/linux/BUILD       |   1 +
 test/syscalls/linux/proc_net.cc | 213 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 213 insertions(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 84a8eb76c..d243be9e4 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1552,6 +1552,7 @@ cc_binary(
     srcs = ["proc_net.cc"],
     linkstatic = 1,
     deps = [
+        ":socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index efdaf202b..af4cd616a 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -12,9 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "gtest/gtest.h"
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+
+#include "absl/strings/str_split.h"
 #include "gtest/gtest.h"
 #include "test/util/capability_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/fs_util.h"
 #include "test/util/test_util.h"
@@ -57,6 +65,209 @@ TEST(ProcSysNetIpv4Sack, CanReadAndWrite) {
   EXPECT_EQ(buf, to_write);
 }
 
+PosixErrorOr<uint64_t> GetSNMPMetricFromProc(const std::string snmp,
+                                             const std::string &type,
+                                             const std::string &item) {
+  std::vector<std::string> snmp_vec = absl::StrSplit(snmp, '\n');
+
+  // /proc/net/snmp prints a line of headers followed by a line of metrics.
+  // Only search the headers.
+  for (unsigned i = 0; i < snmp_vec.size(); i = i + 2) {
+    if (!absl::StartsWith(snmp_vec[i], type)) continue;
+
+    std::vector<std::string> fields =
+        absl::StrSplit(snmp_vec[i], ' ', absl::SkipWhitespace());
+
+    EXPECT_TRUE((i + 1) < snmp_vec.size());
+    std::vector<std::string> values =
+        absl::StrSplit(snmp_vec[i + 1], ' ', absl::SkipWhitespace());
+
+    EXPECT_TRUE(!fields.empty() && fields.size() == values.size());
+
+    // Metrics start at index 1; index 0 of each line is the type label.
+    for (unsigned j = 1; j < fields.size(); j++) {
+      if (fields[j] == item) {
+        uint64_t val;
+        if (!absl::SimpleAtoi(values[j], &val)) {
+          return PosixError(EINVAL,
+                            absl::StrCat("field is not a number: ", values[j]));
+        }
+
+        return val;
+      }
+    }
+  }
+  // We should never get here.
+  return PosixError(
+      EINVAL, absl::StrCat("failed to find ", type, "/", item, " in:", snmp));
+}
+
+TEST(ProcNetSnmp, TcpReset) {
+  // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
+  const DisableSave ds;
+
+  uint64_t oldAttemptFails;
+  uint64_t oldActiveOpens;
+  uint64_t oldOutRsts;
+  auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  oldActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+  oldOutRsts = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "OutRsts"));
+  oldAttemptFails = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "AttemptFails"));
+
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
+
+  struct sockaddr_in sin = {
+    .sin_family = AF_INET,
+    .sin_port = htons(1234),
+  };
+  sin.sin_addr.s_addr = inet_addr("127.0.0.1");
+  ASSERT_THAT(connect(s.get(), (struct sockaddr *)&sin, sizeof(sin)),
+              SyscallFailsWithErrno(ECONNREFUSED));
+
+  uint64_t newAttemptFails;
+  uint64_t newActiveOpens;
+  uint64_t newOutRsts;
+  snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  newActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+  newOutRsts = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "OutRsts"));
+  newAttemptFails = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "AttemptFails"));
+
+  EXPECT_EQ(oldActiveOpens, newActiveOpens - 1);
+  EXPECT_EQ(oldOutRsts, newOutRsts - 1);
+  EXPECT_EQ(oldAttemptFails, newAttemptFails - 1);
+}
+
+TEST(ProcNetSnmp, TcpEstab) {
+  // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
+  const DisableSave ds;
+
+  uint64_t oldEstabResets;
+  uint64_t oldActiveOpens;
+  uint64_t oldPassiveOpens;
+  uint64_t oldCurrEstab;
+  auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  oldActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+  oldPassiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "PassiveOpens"));
+  oldCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
+  oldEstabResets = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "EstabResets"));
+
+  FileDescriptor s_listen =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
+
+  struct sockaddr_in sin = {
+    .sin_family = AF_INET,
+    .sin_port = htons(1234),
+  };
+  sin.sin_addr.s_addr = inet_addr("127.0.0.1");
+  ASSERT_THAT(bind(s_listen.get(), (struct sockaddr *)&sin, sizeof(sin)),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(s_listen.get(), 1), SyscallSucceeds());
+
+  FileDescriptor s_connect =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
+  ASSERT_THAT(connect(s_connect.get(), (struct sockaddr *)&sin, sizeof(sin)),
+              SyscallSucceeds());
+
+  auto s_accept =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(s_listen.get(), nullptr, nullptr));
+
+  uint64_t newEstabResets;
+  uint64_t newActiveOpens;
+  uint64_t newPassiveOpens;
+  uint64_t newCurrEstab;
+  snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  newActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+  newPassiveOpens = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "PassiveOpens"));
+  newCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
+
+  EXPECT_EQ(oldActiveOpens, newActiveOpens - 1);
+  EXPECT_EQ(oldPassiveOpens, newPassiveOpens - 1);
+  EXPECT_EQ(oldCurrEstab, newCurrEstab - 2);
+
+  ASSERT_THAT(send(s_connect.get(), "a", 1, 0), SyscallSucceedsWithValue(1));
+
+  s_accept.reset(-1);
+  s_connect.reset(-1);
+
+  snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  newCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
+  newEstabResets = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "EstabResets"));
+
+  EXPECT_EQ(oldCurrEstab, newCurrEstab);
+  EXPECT_EQ(oldEstabResets, newEstabResets - 2);
+}
+
+TEST(ProcNetSnmp, UdpNoPorts) {
+  // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
+  const DisableSave ds;
+
+  uint64_t oldOutDatagrams;
+  uint64_t oldNoPorts;
+  auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  oldOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+  oldNoPorts = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "NoPorts"));
+
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+  struct sockaddr_in sin = {
+    .sin_family = AF_INET,
+    .sin_port = htons(1234),
+  };
+  sin.sin_addr.s_addr = inet_addr("127.0.0.1");
+  ASSERT_THAT(sendto(s.get(), "a", 1, 0, (struct sockaddr *)&sin, sizeof(sin)),
+              SyscallSucceedsWithValue(1));
+
+  uint64_t newOutDatagrams;
+  uint64_t newNoPorts;
+  snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  newOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+  newNoPorts = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "NoPorts"));
+
+  EXPECT_EQ(oldOutDatagrams, newOutDatagrams - 1);
+  EXPECT_EQ(oldNoPorts, newNoPorts - 1);
+}
+
+TEST(ProcNetSnmp, UdpIn) {
+  // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
+  const DisableSave ds;
+
+  uint64_t oldOutDatagrams;
+  uint64_t oldInDatagrams;
+  auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  oldOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+  oldInDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "InDatagrams"));
+
+  FileDescriptor server =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+  struct sockaddr_in sin = {
+    .sin_family = AF_INET,
+    .sin_port = htons(1234),
+  };
+  sin.sin_addr.s_addr = inet_addr("127.0.0.1");
+  ASSERT_THAT(bind(server.get(), (struct sockaddr *)&sin, sizeof(sin)),
+      SyscallSucceeds());
+
+  FileDescriptor client =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  ASSERT_THAT(sendto(client.get(), "a", 1, 0, (struct sockaddr *)&sin,
+                     sizeof(sin)), SyscallSucceedsWithValue(1));
+
+  char buf[128];
+  ASSERT_THAT(recvfrom(server.get(), buf, sizeof(buf), 0, NULL, NULL),
+              SyscallSucceedsWithValue(1));
+
+  uint64_t newOutDatagrams;
+  uint64_t newInDatagrams;
+  snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+  newOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+  newInDatagrams = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "InDatagrams"));
+
+  EXPECT_EQ(oldOutDatagrams, newOutDatagrams - 1);
+  EXPECT_EQ(oldInDatagrams, newInDatagrams - 1);
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From d277bfba2702b319d8336b65429cf8775661ea2f Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <henry.tjf@antfin.com>
Date: Mon, 20 May 2019 11:26:10 +0000
Subject: epsocket: support /proc/net/snmp

Netstack has its own stats; we use them to fill /proc/net/snmp.

Note that some metrics are not recorded in Netstack; these are shown
as 0 in the proc file.

Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
Change-Id: Ie0089184507d16f49bc0057b4b0482094417ebe1
---
 pkg/sentry/socket/netstack/stack.go | 93 ++++++++++++++++++++++++++++++++++++-
 pkg/tcpip/transport/tcp/accept.go   |  6 +--
 pkg/tcpip/transport/tcp/connect.go  | 12 ++---
 test/syscalls/linux/proc_net.cc     | 23 +++++++++
 4 files changed, 121 insertions(+), 13 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index fda0156e5..d5db8c17c 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -144,7 +144,98 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
 
 // Statistics implements inet.Stack.Statistics.
 func (s *Stack) Statistics(stat interface{}, arg string) error {
-	return syserr.ErrEndpointOperation.ToError()
+	switch stats := stat.(type) {
+	case *inet.StatSNMPIP:
+		ip := Metrics.IP
+		*stats = inet.StatSNMPIP{
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/Forwarding.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/DefaultTTL.
+			ip.PacketsReceived.Value(),          // InReceives.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/InHdrErrors.
+			ip.InvalidAddressesReceived.Value(), // InAddrErrors.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ForwDatagrams.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/InUnknownProtos.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/InDiscards.
+			ip.PacketsDelivered.Value(),         // InDelivers.
+			ip.PacketsSent.Value(),              // OutRequests.
+			ip.OutgoingPacketErrors.Value(),     // OutDiscards.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/OutNoRoutes.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ReasmTimeout.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ReasmReqds.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ReasmOKs.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/ReasmFails.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/FragOKs.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/FragFails.
+			0,                                   // TODO(gvisor.dev/issue/969): Support Ip/FragCreates.
+		}
+	case *inet.StatSNMPICMP:
+		in := Metrics.ICMP.V4PacketsReceived.ICMPv4PacketStats
+		out := Metrics.ICMP.V4PacketsSent.ICMPv4PacketStats
+		*stats = inet.StatSNMPICMP{
+			0, // TODO(gvisor.dev/issue/969): Support Icmp/InMsgs.
+			Metrics.ICMP.V4PacketsSent.Dropped.Value(), // InErrors.
+			0,                         // TODO(gvisor.dev/issue/969): Support Icmp/InCsumErrors.
+			in.DstUnreachable.Value(), // InDestUnreachs.
+			in.TimeExceeded.Value(),   // InTimeExcds.
+			in.ParamProblem.Value(),   // InParmProbs.
+			in.SrcQuench.Value(),      // InSrcQuenchs.
+			in.Redirect.Value(),       // InRedirects.
+			in.Echo.Value(),           // InEchos.
+			in.EchoReply.Value(),      // InEchoReps.
+			in.Timestamp.Value(),      // InTimestamps.
+			in.TimestampReply.Value(), // InTimestampReps.
+			in.InfoRequest.Value(),    // InAddrMasks.
+			in.InfoReply.Value(),      // InAddrMaskReps.
+			0,                         // TODO(gvisor.dev/issue/969): Support Icmp/OutMsgs.
+			Metrics.ICMP.V4PacketsReceived.Invalid.Value(), // OutErrors.
+			out.DstUnreachable.Value(),                     // OutDestUnreachs.
+			out.TimeExceeded.Value(),                       // OutTimeExcds.
+			out.ParamProblem.Value(),                       // OutParmProbs.
+			out.SrcQuench.Value(),                          // OutSrcQuenchs.
+			out.Redirect.Value(),                           // OutRedirects.
+			out.Echo.Value(),                               // OutEchos.
+			out.EchoReply.Value(),                          // OutEchoReps.
+			out.Timestamp.Value(),                          // OutTimestamps.
+			out.TimestampReply.Value(),                     // OutTimestampReps.
+			out.InfoRequest.Value(),                        // OutAddrMasks.
+			out.InfoReply.Value(),                          // OutAddrMaskReps.
+		}
+	case *inet.StatSNMPTCP:
+		tcp := Metrics.TCP
+		// RFC 2012 (updates 1213):  SNMPv2-MIB-TCP.
+		*stats = inet.StatSNMPTCP{
+			1,                                     // RtoAlgorithm.
+			200,                                   // RtoMin.
+			120000,                                // RtoMax.
+			(1<<64 - 1),                           // MaxConn.
+			tcp.ActiveConnectionOpenings.Value(),  // ActiveOpens.
+			tcp.PassiveConnectionOpenings.Value(), // PassiveOpens.
+			tcp.FailedConnectionAttempts.Value(),  // AttemptFails.
+			tcp.EstablishedResets.Value(),         // EstabResets.
+			tcp.CurrentEstablished.Value(),        // CurrEstab.
+			tcp.ValidSegmentsReceived.Value(),     // InSegs.
+			tcp.SegmentsSent.Value(),              // OutSegs.
+			tcp.Retransmits.Value(),               // RetransSegs.
+			tcp.InvalidSegmentsReceived.Value(),   // InErrs.
+			tcp.ResetsSent.Value(),                // OutRsts.
+			tcp.ChecksumErrors.Value(),            // InCsumErrors.
+		}
+	case *inet.StatSNMPUDP:
+		udp := Metrics.UDP
+		*stats = inet.StatSNMPUDP{
+			udp.PacketsReceived.Value(),     // InDatagrams.
+			udp.UnknownPortErrors.Value(),   // NoPorts.
+			0,                               // TODO(gvisor.dev/issue/969): Support Udp/InErrors.
+			udp.PacketsSent.Value(),         // OutDatagrams.
+			udp.ReceiveBufferErrors.Value(), // RcvbufErrors.
+			0,                               // TODO(gvisor.dev/issue/969): Support Udp/SndbufErrors.
+			0,                               // TODO(gvisor.dev/issue/969): Support Udp/InCsumErrors.
+			0,                               // TODO(gvisor.dev/issue/969): Support Udp/IgnoredMulti.
+		}
+	default:
+		return syserr.ErrEndpointOperation.ToError()
+	}
+	return nil
 }
 
 // RouteTable implements inet.Stack.RouteTable.
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 2b4c5c2f9..65c346046 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -297,10 +297,8 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 		return nil, err
 	}
 	ep.mu.Lock()
-	if ep.state != StateEstablished {
-		ep.stack.Stats().TCP.CurrentEstablished.Increment()
-		ep.state = StateEstablished
-	}
+	ep.stack.Stats().TCP.CurrentEstablished.Increment()
+	ep.state = StateEstablished
 	ep.mu.Unlock()
 
 	// Update the receive window scaling. We can't do it before the
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 4467dda82..b724d02bb 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -928,10 +928,8 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			e.lastErrorMu.Unlock()
 
 			e.mu.Lock()
-			if e.state == StateEstablished || e.state == StateCloseWait {
-				e.stack.Stats().TCP.EstablishedResets.Increment()
-				e.stack.Stats().TCP.CurrentEstablished.Decrement()
-			}
+			e.stack.Stats().TCP.EstablishedResets.Increment()
+			e.stack.Stats().TCP.CurrentEstablished.Decrement()
 			e.state = StateError
 			e.HardError = err
 
@@ -1126,10 +1124,8 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	// Mark endpoint as closed.
 	e.mu.Lock()
 	if e.state != StateError {
-		if e.state == StateEstablished || e.state == StateCloseWait {
-			e.stack.Stats().TCP.EstablishedResets.Increment()
-			e.stack.Stats().TCP.CurrentEstablished.Decrement()
-		}
+		e.stack.Stats().TCP.EstablishedResets.Increment()
+		e.stack.Stats().TCP.CurrentEstablished.Decrement()
 		e.state = StateClose
 	}
 	// Lock released below.
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index af4cd616a..d0ef8d380 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -15,11 +15,14 @@
 #include <arpa/inet.h>
 #include <errno.h>
 #include <netinet/in.h>
+#include <poll.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/syscall.h>
 
 #include "absl/strings/str_split.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
 #include "gtest/gtest.h"
 #include "test/util/capability_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
@@ -184,11 +187,31 @@ TEST(ProcNetSnmp, TcpEstab) {
   EXPECT_EQ(oldPassiveOpens, newPassiveOpens - 1);
   EXPECT_EQ(oldCurrEstab, newCurrEstab - 2);
 
+  // Send 1 byte from client to server.
   ASSERT_THAT(send(s_connect.get(), "a", 1, 0), SyscallSucceedsWithValue(1));
 
+  constexpr int kPollTimeoutMs = 20000;  // Wait up to 20 seconds for the data.
+
+  // Wait until server-side fd sees the data on its side but don't read it.
+  struct pollfd poll_fd = {s_accept.get(), POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+              SyscallSucceedsWithValue(1));
+
+  // Now close server-side fd without reading the data which leads to a RST
+  // packet sent to client side.
   s_accept.reset(-1);
+
+  // Wait until client-side fd sees RST packet.
+  struct pollfd poll_fd1 = {s_connect.get(), POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&poll_fd1, 1, kPollTimeoutMs),
+              SyscallSucceedsWithValue(1));
+
+  // Now close client-side fd.
   s_connect.reset(-1);
 
+  // Give netstack time to process the closes before re-reading the stats.
+  absl::SleepFor(absl::Seconds(1.0));
+
   snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
   newCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
   newEstabResets = ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "EstabResets"));
-- 
cgit v1.2.3


From c0065e296f6e840ec1f6797fb0fd55cde0fff785 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Wed, 23 Oct 2019 12:58:40 -0700
Subject: Remove comparison between signed and unsigned int

Some compilers don't like the comparison between int and size_t. Remove it.

The other changes are minor style cleanups.

PiperOrigin-RevId: 276333450
---
 test/syscalls/linux/sendfile_socket.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc
index 1c56540bc..3331288b7 100644
--- a/test/syscalls/linux/sendfile_socket.cc
+++ b/test/syscalls/linux/sendfile_socket.cc
@@ -185,7 +185,7 @@ TEST_P(SendFileTest, Shutdown) {
   // Create a socket.
   std::tuple<int, int> fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets());
   const FileDescriptor client(std::get<0>(fds));
-  FileDescriptor server(std::get<1>(fds));  // non-const, released below.
+  FileDescriptor server(std::get<1>(fds));  // non-const, reset below.
 
   // If this is a TCP socket, then turn off linger.
   if (GetParam() == AF_INET) {
@@ -210,14 +210,14 @@ TEST_P(SendFileTest, Shutdown) {
   // checking the contents (other tests do that), so we just re-use the same
   // buffer as above.
   ScopedThread t([&]() {
-    int done = 0;
+    size_t done = 0;
     while (done < data.size()) {
-      int n = read(server.get(), data.data(), data.size());
+      int n = RetryEINTR(read)(server.get(), data.data(), data.size());
       ASSERT_THAT(n, SyscallSucceeds());
       done += n;
     }
     // Close the server side socket.
-    ASSERT_THAT(close(server.release()), SyscallSucceeds());
+    server.reset();
   });
 
   // Continuously stream from the file to the socket. Note we do not assert
-- 
cgit v1.2.3


From 072af49059a1818e0e06188be81fe425363acf55 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 23 Oct 2019 17:20:07 -0700
Subject: Add check for proper settings to AF_PACKET tests.

As in packet_socket_raw.cc, we should check that certain proc files are set
correctly.

PiperOrigin-RevId: 276384534
---
 test/syscalls/linux/packet_socket.cc | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index fcf64ee59..92ae55eec 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -130,6 +130,20 @@ void CookedPacketTest::SetUp() {
     GTEST_SKIP();
   }
 
+  if (!IsRunningOnGvisor()) {
+    FileDescriptor acceptLocal = ASSERT_NO_ERRNO_AND_VALUE(
+        Open("/proc/sys/net/ipv4/conf/lo/accept_local", O_RDONLY));
+    FileDescriptor routeLocalnet = ASSERT_NO_ERRNO_AND_VALUE(
+        Open("/proc/sys/net/ipv4/conf/lo/route_localnet", O_RDONLY));
+    char enabled;
+    ASSERT_THAT(read(acceptLocal.get(), &enabled, 1),
+                SyscallSucceedsWithValue(1));
+    ASSERT_EQ(enabled, '1');
+    ASSERT_THAT(read(routeLocalnet.get(), &enabled, 1),
+                SyscallSucceedsWithValue(1));
+    ASSERT_EQ(enabled, '1');
+  }
+
   ASSERT_THAT(socket_ = socket(AF_PACKET, SOCK_DGRAM, htons(GetParam())),
               SyscallSucceeds());
 }
-- 
cgit v1.2.3


From 7ca50236c42ad1b1aa19951815d03b62c0c722ed Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 23 Oct 2019 22:21:33 -0700
Subject: Handle AT_EMPTY_PATH flag in execveat.

PiperOrigin-RevId: 276419967
---
 pkg/sentry/syscalls/linux/linux64_amd64.go |  2 +-
 pkg/sentry/syscalls/linux/sys_thread.go    | 32 ++++++++++-----
 test/syscalls/linux/exec.cc                | 62 ++++++++++++++++++++++++++++++
 test/util/multiprocess_util.h              |  9 +++++
 4 files changed, 95 insertions(+), 10 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index aedb6d774..6d3801ad9 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -362,7 +362,7 @@ var AMD64 = &kernel.SyscallTable{
 		319: syscalls.Supported("memfd_create", MemfdCreate),
 		320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil),
 		321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
-		322: syscalls.PartiallySupported("execveat", Execveat, "No support for AT_EMPTY_PATH, AT_SYMLINK_FOLLOW.", nil),
+		322: syscalls.PartiallySupported("execveat", Execveat, "No support for AT_SYMLINK_FOLLOW.", nil),
 		323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
 		324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(b/118904897)
 		325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 6e425f1ec..7ece7ba6f 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -105,18 +105,26 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 		}
 	}
 
-	if flags != 0 {
-		// TODO(b/128449944): Handle AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW.
+	if flags&linux.AT_SYMLINK_NOFOLLOW != 0 {
+		// TODO(b/128449944): Handle AT_SYMLINK_NOFOLLOW.
 		t.Kernel().EmitUnimplementedEvent(t)
 		return 0, nil, syserror.ENOSYS
 	}
 
+	atEmptyPath := flags&linux.AT_EMPTY_PATH != 0
+	if !atEmptyPath && len(pathname) == 0 {
+		return 0, nil, syserror.ENOENT
+	}
+
 	root := t.FSContext().RootDirectory()
 	defer root.DecRef()
 
 	var wd *fs.Dirent
+	var executable *fs.File
 	if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) {
-		// If pathname is absolute, LoadTaskImage() will ignore the wd.
+		// Even if the pathname is absolute, we may still need the wd
+		// for interpreter scripts if the path of the interpreter is
+		// relative.
 		wd = t.FSContext().WorkingDirectory()
 	} else {
 		// Need to extract the given FD.
@@ -126,17 +134,23 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 		}
 		defer f.DecRef()
 
-		wd = f.Dirent
-		wd.IncRef()
-		if !fs.IsDir(wd.Inode.StableAttr) {
-			return 0, nil, syserror.ENOTDIR
+		if atEmptyPath && len(pathname) == 0 {
+			executable = f
+		} else {
+			wd = f.Dirent
+			wd.IncRef()
+			if !fs.IsDir(wd.Inode.StableAttr) {
+				return 0, nil, syserror.ENOTDIR
+			}
 		}
 	}
-	defer wd.DecRef()
+	if wd != nil {
+		defer wd.DecRef()
+	}
 
 	// Load the new TaskContext.
 	maxTraversals := uint(linux.MaxSymlinkTraversals)
-	tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, pathname, nil, argv, envv, t.Arch().FeatureSet())
+	tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, pathname, executable, argv, envv, t.Arch().FeatureSet())
 	if se != nil {
 		return 0, nil, se.ToError()
 	}
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 85734c290..03ec9f75f 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -550,6 +550,18 @@ TEST(ExecveatTest, Basic) {
                 ArgEnvExitStatus(0, 0), absl::StrCat(absolute_path, "\n"));
 }
 
+TEST(ExecveatTest, FDNotADirectory) {
+  std::string absolute_path = WorkloadPath(kBasicWorkload);
+  std::string relative_path = std::string(Basename(absolute_path));
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(absolute_path, 0));
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(fd.get(), relative_path,
+                                            {absolute_path}, {}, /*flags=*/0,
+                                            /*child=*/nullptr, &execve_errno));
+  EXPECT_EQ(execve_errno, ENOTDIR);
+}
+
 TEST(ExecveatTest, AbsolutePathWithFDCWD) {
   std::string path = WorkloadPath(kBasicWorkload);
   CheckExecveat(AT_FDCWD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0,
@@ -564,6 +576,56 @@ TEST(ExecveatTest, AbsolutePath) {
                 absl::StrCat(path, "\n"));
 }
 
+TEST(ExecveatTest, EmptyPathBasic) {
+  std::string path = WorkloadPath(kBasicWorkload);
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
+
+  CheckExecveat(fd.get(), "", {path}, {}, AT_EMPTY_PATH, ArgEnvExitStatus(0, 0),
+                absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, EmptyPathWithDirFD) {
+  std::string path = WorkloadPath(kBasicWorkload);
+  std::string parent_dir = std::string(Dirname(path));
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(dirfd.get(), "", {path}, {},
+                                            AT_EMPTY_PATH,
+                                            /*child=*/nullptr, &execve_errno));
+  EXPECT_EQ(execve_errno, EACCES);
+}
+
+TEST(ExecveatTest, EmptyPathWithoutEmptyPathFlag) {
+  std::string path = WorkloadPath(kBasicWorkload);
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(
+      fd.get(), "", {path}, {}, /*flags=*/0, /*child=*/nullptr, &execve_errno));
+  EXPECT_EQ(execve_errno, ENOENT);
+}
+
+TEST(ExecveatTest, AbsolutePathWithEmptyPathFlag) {
+  std::string path = WorkloadPath(kBasicWorkload);
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
+
+  CheckExecveat(fd.get(), path, {path}, {}, AT_EMPTY_PATH,
+                ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, RelativePathWithEmptyPathFlag) {
+  std::string absolute_path = WorkloadPath(kBasicWorkload);
+  std::string parent_dir = std::string(Dirname(absolute_path));
+  std::string relative_path = std::string(Basename(absolute_path));
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
+
+  CheckExecveat(dirfd.get(), relative_path, {absolute_path}, {}, AT_EMPTY_PATH,
+                ArgEnvExitStatus(0, 0), absl::StrCat(absolute_path, "\n"));
+}
+
 // Priority consistent across calls to execve()
 TEST(GetpriorityTest, ExecveMaintainsPriority) {
   int prio = 16;
diff --git a/test/util/multiprocess_util.h b/test/util/multiprocess_util.h
index c413d63ea..61526b4e7 100644
--- a/test/util/multiprocess_util.h
+++ b/test/util/multiprocess_util.h
@@ -109,6 +109,15 @@ PosixErrorOr<Cleanup> ForkAndExecveat(int32_t dirfd, const std::string& pathname
                                       const std::function<void()>& fn,
                                       pid_t* child, int* execve_errno);
 
+inline PosixErrorOr<Cleanup> ForkAndExecveat(int32_t dirfd,
+                                             const std::string& pathname,
+                                             const ExecveArray& argv,
+                                             const ExecveArray& envv, int flags,
+                                             pid_t* child, int* execve_errno) {
+  return ForkAndExecveat(
+      dirfd, pathname, argv, envv, flags, [] {}, child, execve_errno);
+}
+
 // Calls fn in a forked subprocess and returns the exit status of the
 // subprocess.
 //
-- 
cgit v1.2.3


From d9fd5363409facbc5cf04b85b3b0e7dade085dd9 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 24 Oct 2019 01:44:03 -0700
Subject: Handle AT_SYMLINK_NOFOLLOW flag for execveat.

PiperOrigin-RevId: 276441249
---
 pkg/sentry/kernel/kernel.go                |  2 +-
 pkg/sentry/kernel/task_context.go          |  4 +-
 pkg/sentry/loader/elf.go                   |  2 +-
 pkg/sentry/loader/loader.go                | 24 ++++++---
 pkg/sentry/syscalls/linux/linux64_amd64.go |  2 +-
 pkg/sentry/syscalls/linux/sys_thread.go    | 10 ++--
 test/syscalls/linux/exec.cc                | 79 +++++++++++++++++++++++++++---
 7 files changed, 96 insertions(+), 27 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 3cda03891..d70ad5c09 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -805,7 +805,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	// Create a fresh task context.
 	remainingTraversals = uint(args.MaxSymlinkTraversals)
 
-	tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet)
+	tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, true /*resolveFinal*/, k.featureSet)
 	if se != nil {
 		return nil, 0, errors.New(se.String())
 	}
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 8639d379f..1da718b27 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -145,7 +145,7 @@ func (t *Task) Stack() *arch.Stack {
 //  * argv: Binary argv
 //  * envv: Binary envv
 //  * fs: Binary FeatureSet
-func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, file *fs.File, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) {
+func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, file *fs.File, argv, envv []string, resolveFinal bool, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) {
 	// If File is not nil, we should load that instead of resolving filename.
 	if file != nil {
 		filename = file.MappedName(ctx)
@@ -155,7 +155,7 @@ func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, r
 	m := mm.NewMemoryManager(k, k)
 	defer m.DecUsers(ctx)
 
-	os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, envv, k.extraAuxv, k.vdso)
+	os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, envv, resolveFinal, k.extraAuxv, k.vdso)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 2d9251e92..86f6b269b 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -640,7 +640,7 @@ func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace
 
 	var interp loadedELF
 	if bin.interpreter != "" {
-		d, i, err := openPath(ctx, mounts, root, wd, maxTraversals, bin.interpreter)
+		d, i, err := openPath(ctx, mounts, root, wd, maxTraversals, bin.interpreter, true /*resolveFinal*/)
 		if err != nil {
 			ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err)
 			return loadedELF{}, nil, err
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 089d1635b..f5303491d 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -57,13 +57,19 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in
 // installed in the Task FDTable. The caller takes ownership of both.
 //
 // name must be a readable, executable, regular file.
-func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, name string) (*fs.Dirent, *fs.File, error) {
+func openPath(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, name string, resolveFinal bool) (*fs.Dirent, *fs.File, error) {
+	var err error
 	if name == "" {
 		ctx.Infof("cannot open empty name")
 		return nil, nil, syserror.ENOENT
 	}
 
-	d, err := mm.FindInode(ctx, root, wd, name, maxTraversals)
+	var d *fs.Dirent
+	if resolveFinal {
+		d, err = mounts.FindInode(ctx, root, wd, name, maxTraversals)
+	} else {
+		d, err = mounts.FindLink(ctx, root, wd, name, maxTraversals)
+	}
 	if err != nil {
 		return nil, nil, err
 	}
@@ -71,10 +77,13 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m
 	// Open file will take a reference to Dirent, so destroy this one.
 	defer d.DecRef()
 
+	if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
+		return nil, nil, syserror.ELOOP
+	}
+
 	return openFile(ctx, nil, d, name)
 }
 
-// openFile performs checks on a file to be executed. If provided a *fs.File,
 // openFile takes that file's Dirent and performs checks on it. If provided a
 // *fs.Dirent and not a *fs.File, it creates a *fs.File object from the Dirent's
 // Inode and performs checks on that.
@@ -181,7 +190,7 @@ const (
 //  * arch.Context matching the binary arch
 //  * fs.Dirent of the binary file
 //  * Possibly updated argv
-func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, features *cpuid.FeatureSet, filename string, passedFile *fs.File, argv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
+func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, features *cpuid.FeatureSet, filename string, passedFile *fs.File, argv []string, resolveFinal bool) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
 	for i := 0; i < maxLoaderAttempts; i++ {
 		var (
 			d   *fs.Dirent
@@ -189,8 +198,7 @@ func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamesp
 			err error
 		)
 		if passedFile == nil {
-			d, f, err = openPath(ctx, mounts, root, wd, remainingTraversals, filename)
-
+			d, f, err = openPath(ctx, mounts, root, wd, remainingTraversals, filename, resolveFinal)
 		} else {
 			d, f, err = openFile(ctx, passedFile, nil, "")
 			// Set to nil in case we loop on a Interpreter Script.
@@ -255,9 +263,9 @@ func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamesp
 // Preconditions:
 //  * The Task MemoryManager is empty.
 //  * Load is called on the Task goroutine.
-func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, file *fs.File, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
+func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, file *fs.File, argv, envv []string, resolveFinal bool, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
 	// Load the binary itself.
-	loaded, ac, d, argv, err := loadBinary(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv)
+	loaded, ac, d, argv, err := loadBinary(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, resolveFinal)
 	if err != nil {
 		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", filename, err), syserr.FromError(err).ToLinux())
 	}
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 6d3801ad9..3021440ed 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -362,7 +362,7 @@ var AMD64 = &kernel.SyscallTable{
 		319: syscalls.Supported("memfd_create", MemfdCreate),
 		320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil),
 		321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
-		322: syscalls.PartiallySupported("execveat", Execveat, "No support for AT_SYMLINK_FOLLOW.", nil),
+		322: syscalls.Supported("execveat", Execveat),
 		323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
 		324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}),  // TODO(b/118904897)
 		325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 7ece7ba6f..effe16186 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -105,16 +105,14 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 		}
 	}
 
-	if flags&linux.AT_SYMLINK_NOFOLLOW != 0 {
-		// TODO(b/128449944): Handle AT_SYMLINK_NOFOLLOW.
-		t.Kernel().EmitUnimplementedEvent(t)
-		return 0, nil, syserror.ENOSYS
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+		return 0, nil, syserror.EINVAL
 	}
-
 	atEmptyPath := flags&linux.AT_EMPTY_PATH != 0
 	if !atEmptyPath && len(pathname) == 0 {
 		return 0, nil, syserror.ENOENT
 	}
+	resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0
 
 	root := t.FSContext().RootDirectory()
 	defer root.DecRef()
@@ -150,7 +148,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 
 	// Load the new TaskContext.
 	maxTraversals := uint(linux.MaxSymlinkTraversals)
-	tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, pathname, executable, argv, envv, t.Arch().FeatureSet())
+	tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, pathname, executable, argv, envv, resolveFinal, t.Arch().FeatureSet())
 	if se != nil {
 		return 0, nil, se.ToError()
 	}
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 03ec9f75f..21a5ffd40 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -542,23 +542,23 @@ TEST(ExecveatTest, BasicWithFDCWD) {
 TEST(ExecveatTest, Basic) {
   std::string absolute_path = WorkloadPath(kBasicWorkload);
   std::string parent_dir = std::string(Dirname(absolute_path));
-  std::string relative_path = std::string(Basename(absolute_path));
+  std::string base = std::string(Basename(absolute_path));
   const FileDescriptor dirfd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
 
-  CheckExecveat(dirfd.get(), relative_path, {absolute_path}, {}, /*flags=*/0,
+  CheckExecveat(dirfd.get(), base, {absolute_path}, {}, /*flags=*/0,
                 ArgEnvExitStatus(0, 0), absl::StrCat(absolute_path, "\n"));
 }
 
 TEST(ExecveatTest, FDNotADirectory) {
   std::string absolute_path = WorkloadPath(kBasicWorkload);
-  std::string relative_path = std::string(Basename(absolute_path));
+  std::string base = std::string(Basename(absolute_path));
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(absolute_path, 0));
 
   int execve_errno;
-  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(fd.get(), relative_path,
-                                            {absolute_path}, {}, /*flags=*/0,
-                                            /*child=*/nullptr, &execve_errno));
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(fd.get(), base, {absolute_path}, {},
+                                            /*flags=*/0, /*child=*/nullptr,
+                                            &execve_errno));
   EXPECT_EQ(execve_errno, ENOTDIR);
 }
 
@@ -618,14 +618,77 @@ TEST(ExecveatTest, AbsolutePathWithEmptyPathFlag) {
 TEST(ExecveatTest, RelativePathWithEmptyPathFlag) {
   std::string absolute_path = WorkloadPath(kBasicWorkload);
   std::string parent_dir = std::string(Dirname(absolute_path));
-  std::string relative_path = std::string(Basename(absolute_path));
+  std::string base = std::string(Basename(absolute_path));
   const FileDescriptor dirfd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
 
-  CheckExecveat(dirfd.get(), relative_path, {absolute_path}, {}, AT_EMPTY_PATH,
+  CheckExecveat(dirfd.get(), base, {absolute_path}, {}, AT_EMPTY_PATH,
                 ArgEnvExitStatus(0, 0), absl::StrCat(absolute_path, "\n"));
 }
 
+TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) {  // Link is named relative to dirfd.
+  std::string parent_dir = "/tmp";
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(parent_dir, WorkloadPath(kBasicWorkload)));
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
+  std::string base = std::string(Basename(link.path()));  // Basename of the link's path.
+
+  int execve_errno;  // Out-param handed to ForkAndExecveat below.
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(dirfd.get(), base, {base}, {},
+                                            AT_SYMLINK_NOFOLLOW,
+                                            /*child=*/nullptr, &execve_errno));
+  EXPECT_EQ(execve_errno, ELOOP);  // NOFOLLOW on the link itself must report ELOOP.
+}
+
+TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) {  // Same check, using the link's full path.
+  std::string parent_dir = "/tmp";
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(parent_dir, WorkloadPath(kBasicWorkload)));
+  std::string path = link.path();  // Full path of the link; passed with AT_FDCWD.
+
+  int execve_errno;  // Out-param handed to ForkAndExecveat below.
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(AT_FDCWD, path, {path}, {},
+                                            AT_SYMLINK_NOFOLLOW,
+                                            /*child=*/nullptr, &execve_errno));
+  EXPECT_EQ(execve_errno, ELOOP);  // NOFOLLOW on the link itself must report ELOOP.
+}
+
+TEST(ExecveatTest, SymlinkNoFollowAndEmptyPath) {  // Both flags set, empty pathname.
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+  std::string path = link.path();
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, 0));  // Opened via the link.
+
+  CheckExecveat(fd.get(), "", {path}, {}, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,  // Exec the fd.
+                ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n"));  // Expected status/output.
+}
+
+TEST(ExecveatTest, SymlinkNoFollowIgnoreSymlinkAncestor) {  // Symlink is a parent, not the leaf.
+  TempPath parent_link =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateSymlinkTo("/tmp", "/bin"));
+  std::string path_with_symlink = JoinPath(parent_link.path(), "echo");  // <link>/echo
+
+  CheckExecveat(AT_FDCWD, path_with_symlink, {path_with_symlink}, {},
+                AT_SYMLINK_NOFOLLOW, ArgEnvExitStatus(0, 0), "");  // Not an ELOOP expectation.
+}
+
+TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) {  // No TempPath symlink is created here.
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/bin", O_DIRECTORY));
+
+  CheckExecveat(dirfd.get(), "echo", {"echo"}, {}, AT_SYMLINK_NOFOLLOW,  // "echo" relative to /bin.
+                ArgEnvExitStatus(0, 0), "");
+}
+
+TEST(ExecveatTest, InvalidFlags) {  // Unsupported flag bits must be rejected.
+  int execve_errno;  // Out-param handed to ForkAndExecveat below.
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(
+      /*dirfd=*/-1, "", {}, {}, /*flags=*/0xFFFF, /*child=*/nullptr,  // 0xFFFF: many bits set.
+      &execve_errno));
+  EXPECT_EQ(execve_errno, EINVAL);  // Matches the EINVAL flag check added in sys_thread.go.
+}
+
 // Priority consistent across calls to execve()
 TEST(GetpriorityTest, ExecveMaintainsPriority) {
   int prio = 16;
-- 
cgit v1.2.3


From e0c84f284c8cfadc456a5cf3e7cdacbf4f459b96 Mon Sep 17 00:00:00 2001
From: Haibo <Haibo.Xu@arm.com>
Date: Fri, 25 Oct 2019 12:39:20 -0700
Subject: test/syscall: Remove duplicated gtest/gtest.h include.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I05a7ec69b98b88931ba4a8adb3e8a7b822006001
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/1023 from xiaobo55x:syscall_test d44a8b1f827ed4081997af96cd58ba7449e0a9e1
PiperOrigin-RevId: 276740442
---
 test/syscalls/linux/accept_bind.cc                                 | 1 -
 test/syscalls/linux/accept_bind_stream.cc                          | 1 -
 test/syscalls/linux/bind.cc                                        | 1 -
 test/syscalls/linux/chroot.cc                                      | 1 -
 test/syscalls/linux/connect_external.cc                            | 1 -
 test/syscalls/linux/file_base.h                                    | 1 -
 test/syscalls/linux/ioctl.cc                                       | 1 -
 test/syscalls/linux/madvise.cc                                     | 1 -
 test/syscalls/linux/memory_accounting.cc                           | 1 -
 test/syscalls/linux/pipe.cc                                        | 1 -
 test/syscalls/linux/pread64.cc                                     | 1 -
 test/syscalls/linux/preadv.cc                                      | 1 -
 test/syscalls/linux/preadv2.cc                                     | 1 -
 test/syscalls/linux/proc_net.cc                                    | 1 -
 test/syscalls/linux/proc_net_tcp.cc                                | 1 -
 test/syscalls/linux/proc_net_udp.cc                                | 1 -
 test/syscalls/linux/proc_net_unix.cc                               | 1 -
 test/syscalls/linux/pwrite64.cc                                    | 1 -
 test/syscalls/linux/pwritev2.cc                                    | 1 -
 test/syscalls/linux/readv.cc                                       | 1 -
 test/syscalls/linux/readv_common.cc                                | 1 -
 test/syscalls/linux/readv_socket.cc                                | 1 -
 test/syscalls/linux/rename.cc                                      | 1 -
 test/syscalls/linux/select.cc                                      | 1 -
 test/syscalls/linux/sigaltstack.cc                                 | 1 -
 test/syscalls/linux/signalfd.cc                                    | 1 -
 test/syscalls/linux/socket_bind_to_device.cc                       | 1 -
 test/syscalls/linux/socket_bind_to_device_distribution.cc          | 1 -
 test/syscalls/linux/socket_bind_to_device_sequence.cc              | 1 -
 test/syscalls/linux/socket_blocking.cc                             | 1 -
 test/syscalls/linux/socket_generic.cc                              | 1 -
 test/syscalls/linux/socket_ip_tcp_generic.cc                       | 1 -
 test/syscalls/linux/socket_ip_tcp_udp_generic.cc                   | 1 -
 test/syscalls/linux/socket_ip_udp_generic.cc                       | 1 -
 test/syscalls/linux/socket_ip_unbound.cc                           | 1 -
 test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc | 1 -
 test/syscalls/linux/socket_ipv4_udp_unbound.cc                     | 1 -
 test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc | 1 -
 test/syscalls/linux/socket_non_blocking.cc                         | 1 -
 test/syscalls/linux/socket_non_stream.cc                           | 1 -
 test/syscalls/linux/socket_non_stream_blocking.cc                  | 1 -
 test/syscalls/linux/socket_stream.cc                               | 1 -
 test/syscalls/linux/socket_stream_blocking.cc                      | 1 -
 test/syscalls/linux/socket_stream_nonblock.cc                      | 1 -
 test/syscalls/linux/socket_test_util.h                             | 1 -
 test/syscalls/linux/socket_unix.cc                                 | 1 -
 test/syscalls/linux/socket_unix_cmsg.cc                            | 1 -
 test/syscalls/linux/socket_unix_dgram.cc                           | 1 -
 test/syscalls/linux/socket_unix_dgram_non_blocking.cc              | 1 -
 test/syscalls/linux/socket_unix_non_stream.cc                      | 1 -
 test/syscalls/linux/socket_unix_seqpacket.cc                       | 1 -
 test/syscalls/linux/socket_unix_stream.cc                          | 1 -
 test/syscalls/linux/socket_unix_unbound_abstract.cc                | 1 -
 test/syscalls/linux/socket_unix_unbound_dgram.cc                   | 1 -
 test/syscalls/linux/socket_unix_unbound_filesystem.cc              | 1 -
 test/syscalls/linux/socket_unix_unbound_seqpacket.cc               | 1 -
 test/syscalls/linux/socket_unix_unbound_stream.cc                  | 1 -
 test/syscalls/linux/stat.cc                                        | 1 -
 58 files changed, 58 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc
index 328192a05..427c42ede 100644
--- a/test/syscalls/linux/accept_bind.cc
+++ b/test/syscalls/linux/accept_bind.cc
@@ -17,7 +17,6 @@
 #include <algorithm>
 #include <vector>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/file_descriptor.h"
diff --git a/test/syscalls/linux/accept_bind_stream.cc b/test/syscalls/linux/accept_bind_stream.cc
index b6cdb3f4f..7bcd91e9e 100644
--- a/test/syscalls/linux/accept_bind_stream.cc
+++ b/test/syscalls/linux/accept_bind_stream.cc
@@ -17,7 +17,6 @@
 #include <algorithm>
 #include <vector>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/file_descriptor.h"
diff --git a/test/syscalls/linux/bind.cc b/test/syscalls/linux/bind.cc
index de8cca53b..9547c4ab2 100644
--- a/test/syscalls/linux/bind.cc
+++ b/test/syscalls/linux/bind.cc
@@ -16,7 +16,6 @@
 #include <sys/socket.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index 498c45f16..de1611c21 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -24,7 +24,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
diff --git a/test/syscalls/linux/connect_external.cc b/test/syscalls/linux/connect_external.cc
index 98032ac19..bfe1da82e 100644
--- a/test/syscalls/linux/connect_external.cc
+++ b/test/syscalls/linux/connect_external.cc
@@ -21,7 +21,6 @@
 #include <string>
 #include <tuple>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h
index 36efabcae..4d155b618 100644
--- a/test/syscalls/linux/file_base.h
+++ b/test/syscalls/linux/file_base.h
@@ -32,7 +32,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "absl/strings/string_view.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc
index 4948a76f0..c4f8bff08 100644
--- a/test/syscalls/linux/ioctl.cc
+++ b/test/syscalls/linux/ioctl.cc
@@ -25,7 +25,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/madvise.cc b/test/syscalls/linux/madvise.cc
index 08ff4052c..7fd0ea20c 100644
--- a/test/syscalls/linux/madvise.cc
+++ b/test/syscalls/linux/madvise.cc
@@ -25,7 +25,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/logging.h"
 #include "test/util/memory_util.h"
diff --git a/test/syscalls/linux/memory_accounting.cc b/test/syscalls/linux/memory_accounting.cc
index a6e20f9c3..ff2f49863 100644
--- a/test/syscalls/linux/memory_accounting.cc
+++ b/test/syscalls/linux/memory_accounting.cc
@@ -15,7 +15,6 @@
 #include <sys/mman.h>
 #include <map>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/match.h"
 #include "absl/strings/numbers.h"
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index 10e2a6dfc..c0b354e65 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -19,7 +19,6 @@
 
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/str_cat.h"
 #include "absl/synchronization/notification.h"
diff --git a/test/syscalls/linux/pread64.cc b/test/syscalls/linux/pread64.cc
index 5e3eb1735..2cecf2e5f 100644
--- a/test/syscalls/linux/pread64.cc
+++ b/test/syscalls/linux/pread64.cc
@@ -19,7 +19,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/temp_path.h"
diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc
index eebd129f2..f7ea44054 100644
--- a/test/syscalls/linux/preadv.cc
+++ b/test/syscalls/linux/preadv.cc
@@ -21,7 +21,6 @@
 #include <atomic>
 #include <string>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
diff --git a/test/syscalls/linux/preadv2.cc b/test/syscalls/linux/preadv2.cc
index aac960130..c9246367d 100644
--- a/test/syscalls/linux/preadv2.cc
+++ b/test/syscalls/linux/preadv2.cc
@@ -20,7 +20,6 @@
 #include <string>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/memory/memory.h"
 #include "test/syscalls/linux/file_base.h"
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index dcfd5f86c..65bad06d4 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -20,7 +20,6 @@
 #include <sys/syscall.h>
 #include <sys/types.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/str_split.h"
 #include "absl/time/clock.h"
diff --git a/test/syscalls/linux/proc_net_tcp.cc b/test/syscalls/linux/proc_net_tcp.cc
index f61795592..2659f6a98 100644
--- a/test/syscalls/linux/proc_net_tcp.cc
+++ b/test/syscalls/linux/proc_net_tcp.cc
@@ -17,7 +17,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_join.h"
diff --git a/test/syscalls/linux/proc_net_udp.cc b/test/syscalls/linux/proc_net_udp.cc
index 369df8e0e..f06f1a24b 100644
--- a/test/syscalls/linux/proc_net_udp.cc
+++ b/test/syscalls/linux/proc_net_udp.cc
@@ -17,7 +17,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_join.h"
diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc
index 83dbd1364..66db0acaa 100644
--- a/test/syscalls/linux/proc_net_unix.cc
+++ b/test/syscalls/linux/proc_net_unix.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_format.h"
diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc
index e1603fc2d..b48fe540d 100644
--- a/test/syscalls/linux/pwrite64.cc
+++ b/test/syscalls/linux/pwrite64.cc
@@ -18,7 +18,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/pwritev2.cc b/test/syscalls/linux/pwritev2.cc
index f6a0fc96c..1dbc0d6df 100644
--- a/test/syscalls/linux/pwritev2.cc
+++ b/test/syscalls/linux/pwritev2.cc
@@ -20,7 +20,6 @@
 #include <string>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/util/file_descriptor.h"
diff --git a/test/syscalls/linux/readv.cc b/test/syscalls/linux/readv.cc
index f327ec3a9..4069cbc7e 100644
--- a/test/syscalls/linux/readv.cc
+++ b/test/syscalls/linux/readv.cc
@@ -18,7 +18,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/syscalls/linux/readv_common.h"
diff --git a/test/syscalls/linux/readv_common.cc b/test/syscalls/linux/readv_common.cc
index 35d2dd9e3..9658f7d42 100644
--- a/test/syscalls/linux/readv_common.cc
+++ b/test/syscalls/linux/readv_common.cc
@@ -18,7 +18,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/readv_socket.cc b/test/syscalls/linux/readv_socket.cc
index 3c315cc02..9b6972201 100644
--- a/test/syscalls/linux/readv_socket.cc
+++ b/test/syscalls/linux/readv_socket.cc
@@ -18,7 +18,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/syscalls/linux/readv_common.h"
diff --git a/test/syscalls/linux/rename.cc b/test/syscalls/linux/rename.cc
index c9d76c2e2..5b474ff32 100644
--- a/test/syscalls/linux/rename.cc
+++ b/test/syscalls/linux/rename.cc
@@ -16,7 +16,6 @@
 #include <stdio.h>
 #include <string>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/string_view.h"
 #include "test/util/capability_util.h"
diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc
index 88c010aec..e06a2666d 100644
--- a/test/syscalls/linux/select.cc
+++ b/test/syscalls/linux/select.cc
@@ -20,7 +20,6 @@
 #include <csignal>
 #include <cstdio>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/time/time.h"
 #include "test/syscalls/linux/base_poll_test.h"
diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc
index 69b6e4f90..6fd3989a4 100644
--- a/test/syscalls/linux/sigaltstack.cc
+++ b/test/syscalls/linux/sigaltstack.cc
@@ -21,7 +21,6 @@
 #include <functional>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/util/cleanup.h"
 #include "test/util/fs_util.h"
diff --git a/test/syscalls/linux/signalfd.cc b/test/syscalls/linux/signalfd.cc
index 9379d5878..09ecad34a 100644
--- a/test/syscalls/linux/signalfd.cc
+++ b/test/syscalls/linux/signalfd.cc
@@ -23,7 +23,6 @@
 #include <functional>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/synchronization/mutex.h"
 #include "test/util/file_descriptor.h"
diff --git a/test/syscalls/linux/socket_bind_to_device.cc b/test/syscalls/linux/socket_bind_to_device.cc
index d20821cac..6b27f6eab 100644
--- a/test/syscalls/linux/socket_bind_to_device.cc
+++ b/test/syscalls/linux/socket_bind_to_device.cc
@@ -32,7 +32,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_bind_to_device_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_bind_to_device_distribution.cc b/test/syscalls/linux/socket_bind_to_device_distribution.cc
index 4d2400328..5767181a1 100644
--- a/test/syscalls/linux/socket_bind_to_device_distribution.cc
+++ b/test/syscalls/linux/socket_bind_to_device_distribution.cc
@@ -33,7 +33,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_bind_to_device_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
index a7365d139..e4641c62e 100644
--- a/test/syscalls/linux/socket_bind_to_device_sequence.cc
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -33,7 +33,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_bind_to_device_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_blocking.cc b/test/syscalls/linux/socket_blocking.cc
index 00c50d1bf..d7ce57566 100644
--- a/test/syscalls/linux/socket_blocking.cc
+++ b/test/syscalls/linux/socket_blocking.cc
@@ -19,7 +19,6 @@
 #include <sys/un.h>
 #include <cstdio>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc
index 51d614639..e8f24a59e 100644
--- a/test/syscalls/linux/socket_generic.cc
+++ b/test/syscalls/linux/socket_generic.cc
@@ -19,7 +19,6 @@
 #include <sys/socket.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index bfa7943b1..7e0deda05 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -23,7 +23,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
index de63f79d9..f178f1af9 100644
--- a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
@@ -21,7 +21,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 044394ba7..2a4ed04a5 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -23,7 +23,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_ip_unbound.cc b/test/syscalls/linux/socket_ip_unbound.cc
index fa9a9df6f..b02872308 100644
--- a/test/syscalls/linux/socket_ip_unbound.cc
+++ b/test/syscalls/linux/socket_ip_unbound.cc
@@ -23,7 +23,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
index 3a068aacf..3c3712b50 100644
--- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
@@ -23,7 +23,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
 
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index 67d29af0a..b828b6844 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -20,7 +20,6 @@
 #include <sys/un.h>
 #include <cstdio>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
index 8b8993d3d..98ae414f3 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
@@ -27,7 +27,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_non_blocking.cc b/test/syscalls/linux/socket_non_blocking.cc
index 73e6dc618..c3520cadd 100644
--- a/test/syscalls/linux/socket_non_blocking.cc
+++ b/test/syscalls/linux/socket_non_blocking.cc
@@ -19,7 +19,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_non_stream.cc b/test/syscalls/linux/socket_non_stream.cc
index 3c599b6e8..d91c5ed39 100644
--- a/test/syscalls/linux/socket_non_stream.cc
+++ b/test/syscalls/linux/socket_non_stream.cc
@@ -18,7 +18,6 @@
 #include <sys/socket.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_non_stream_blocking.cc b/test/syscalls/linux/socket_non_stream_blocking.cc
index 76127d181..62d87c1af 100644
--- a/test/syscalls/linux/socket_non_stream_blocking.cc
+++ b/test/syscalls/linux/socket_non_stream_blocking.cc
@@ -19,7 +19,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
diff --git a/test/syscalls/linux/socket_stream.cc b/test/syscalls/linux/socket_stream.cc
index 0417dd347..346443f96 100644
--- a/test/syscalls/linux/socket_stream.cc
+++ b/test/syscalls/linux/socket_stream.cc
@@ -19,7 +19,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/time/clock.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_stream_blocking.cc b/test/syscalls/linux/socket_stream_blocking.cc
index 8367460d2..e9cc082bf 100644
--- a/test/syscalls/linux/socket_stream_blocking.cc
+++ b/test/syscalls/linux/socket_stream_blocking.cc
@@ -19,7 +19,6 @@
 #include <sys/types.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
diff --git a/test/syscalls/linux/socket_stream_nonblock.cc b/test/syscalls/linux/socket_stream_nonblock.cc
index b00748b97..74d608741 100644
--- a/test/syscalls/linux/socket_stream_nonblock.cc
+++ b/test/syscalls/linux/socket_stream_nonblock.cc
@@ -19,7 +19,6 @@
 #include <sys/uio.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h
index 70710195c..be38907c2 100644
--- a/test/syscalls/linux/socket_test_util.h
+++ b/test/syscalls/linux/socket_test_util.h
@@ -29,7 +29,6 @@
 #include <utility>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/str_format.h"
 #include "test/util/file_descriptor.h"
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc
index 875f0391f..8a28202a8 100644
--- a/test/syscalls/linux/socket_unix.cc
+++ b/test/syscalls/linux/socket_unix.cc
@@ -24,7 +24,6 @@
 
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/string_view.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_cmsg.cc b/test/syscalls/linux/socket_unix_cmsg.cc
index 1092e29b1..1159c5229 100644
--- a/test/syscalls/linux/socket_unix_cmsg.cc
+++ b/test/syscalls/linux/socket_unix_cmsg.cc
@@ -24,7 +24,6 @@
 
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "absl/strings/string_view.h"
 #include "test/syscalls/linux/socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_dgram.cc b/test/syscalls/linux/socket_unix_dgram.cc
index 3e0f611d2..3245cf7c9 100644
--- a/test/syscalls/linux/socket_unix_dgram.cc
+++ b/test/syscalls/linux/socket_unix_dgram.cc
@@ -17,7 +17,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
index 707052af8..cd4fba25c 100644
--- a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
+++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
@@ -15,7 +15,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc
index b5c82cd67..276a94eb8 100644
--- a/test/syscalls/linux/socket_unix_non_stream.cc
+++ b/test/syscalls/linux/socket_unix_non_stream.cc
@@ -18,7 +18,6 @@
 #include <sys/mman.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_seqpacket.cc b/test/syscalls/linux/socket_unix_seqpacket.cc
index 6f6367dd5..60fa9e38a 100644
--- a/test/syscalls/linux/socket_unix_seqpacket.cc
+++ b/test/syscalls/linux/socket_unix_seqpacket.cc
@@ -17,7 +17,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_stream.cc b/test/syscalls/linux/socket_unix_stream.cc
index 8f38ed92f..563467365 100644
--- a/test/syscalls/linux/socket_unix_stream.cc
+++ b/test/syscalls/linux/socket_unix_stream.cc
@@ -16,7 +16,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_abstract.cc b/test/syscalls/linux/socket_unix_unbound_abstract.cc
index 4b5832de8..7f5816ace 100644
--- a/test/syscalls/linux/socket_unix_unbound_abstract.cc
+++ b/test/syscalls/linux/socket_unix_unbound_abstract.cc
@@ -15,7 +15,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_dgram.cc b/test/syscalls/linux/socket_unix_unbound_dgram.cc
index 52aef891f..907dca0f1 100644
--- a/test/syscalls/linux/socket_unix_unbound_dgram.cc
+++ b/test/syscalls/linux/socket_unix_unbound_dgram.cc
@@ -16,7 +16,6 @@
 #include <sys/socket.h>
 #include <sys/un.h>
 
-#include "gtest/gtest.h"
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_filesystem.cc b/test/syscalls/linux/socket_unix_unbound_filesystem.cc
index 8cb03c450..b14f24086 100644
--- a/test/syscalls/linux/socket_unix_unbound_filesystem.cc
+++ b/test/syscalls/linux/socket_unix_unbound_filesystem.cc
@@ -15,7 +15,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
index 0575f2e1d..50ffa1d04 100644
--- a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
+++ b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
@@ -15,7 +15,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_stream.cc b/test/syscalls/linux/socket_unix_unbound_stream.cc
index e483d2777..344918c34 100644
--- a/test/syscalls/linux/socket_unix_unbound_stream.cc
+++ b/test/syscalls/linux/socket_unix_unbound_stream.cc
@@ -15,7 +15,6 @@
 #include <stdio.h>
 #include <sys/un.h>
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index 88ab90b5b..30de2f8ff 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -24,7 +24,6 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "gtest/gtest.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
-- 
cgit v1.2.3


From 29273b03842a85bce8314799348231520ceb6e9c Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 29 Oct 2019 10:03:18 -0700
Subject: Disallow execveat on interpreter scripts with fd opened with
 O_CLOEXEC.

When an interpreter script is opened with O_CLOEXEC and the resulting fd is
passed into execveat, an ENOENT error should occur (the script would otherwise
be inaccessible to the interpreter). This matches the actual behavior of
Linux's execveat.

PiperOrigin-RevId: 277306680
---
 pkg/sentry/kernel/kernel.go             |  1 +
 pkg/sentry/loader/loader.go             |  9 +++++++++
 pkg/sentry/syscalls/linux/sys_thread.go |  5 ++++-
 test/syscalls/linux/exec.cc             | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index fcfe7a16d..e64d648e2 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -812,6 +812,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		ResolveFinal:        true,
 		Filename:            args.Filename,
 		File:                args.File,
+		CloseOnExec:         false,
 		Argv:                args.Argv,
 		Envv:                args.Envv,
 		Features:            k.featureSet,
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 818941762..f75ebe08a 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -66,6 +66,12 @@ type LoadArgs struct {
 	// nil, then File will be loaded and Filename will be ignored.
 	File *fs.File
 
+	// CloseOnExec indicates that the executable (or one of its parent
+	// directories) was opened with O_CLOEXEC. If the executable is an
+	// interpreter script, then cause an ENOENT error to occur, since the
+	// script would otherwise be inaccessible to the interpreter.
+	CloseOnExec bool
+
 	// Argv is the vector of arguments to pass to the executable.
 	Argv []string
 
@@ -279,6 +285,9 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 			d.IncRef()
 			return loaded, ac, d, args.Argv, err
 		case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)):
+			if args.CloseOnExec {
+				return loadedELF{}, nil, nil, nil, syserror.ENOENT
+			}
 			args.Filename, args.Argv, err = parseInterpreterScript(ctx, args.Filename, args.File, args.Argv)
 			if err != nil {
 				ctx.Infof("Error loading interpreter script: %v", err)
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 2476f8858..4115116ff 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -120,6 +120,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 
 	var wd *fs.Dirent
 	var executable *fs.File
+	var closeOnExec bool
 	if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) {
 		// Even if the pathname is absolute, we may still need the wd
 		// for interpreter scripts if the path of the interpreter is
@@ -127,11 +128,12 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 		wd = t.FSContext().WorkingDirectory()
 	} else {
 		// Need to extract the given FD.
-		f := t.GetFile(dirFD)
+		f, fdFlags := t.FDTable().Get(dirFD)
 		if f == nil {
 			return 0, nil, syserror.EBADF
 		}
 		defer f.DecRef()
+		closeOnExec = fdFlags.CloseOnExec
 
 		if atEmptyPath && len(pathname) == 0 {
 			executable = f
@@ -157,6 +159,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
 		ResolveFinal:        resolveFinal,
 		Filename:            pathname,
 		File:                executable,
+		CloseOnExec:         closeOnExec,
 		Argv:                argv,
 		Envv:                envv,
 		Features:            t.Arch().FeatureSet(),
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 21a5ffd40..a9067df2a 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -681,6 +681,39 @@ TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) {
                 ArgEnvExitStatus(0, 0), "");
 }
 
+TEST(ExecveatTest, BasicWithCloexecFD) {
+  std::string path = WorkloadPath(kBasicWorkload);
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC));
+
+  CheckExecveat(fd.get(), "", {path}, {}, AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH,
+                ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, InterpreterScriptWithCloexecFD) {
+  std::string path = WorkloadPath(kExitScript);
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC));
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(fd.get(), "", {path}, {},
+                                            AT_EMPTY_PATH, /*child=*/nullptr,
+                                            &execve_errno));
+  EXPECT_EQ(execve_errno, ENOENT);
+}
+
+TEST(ExecveatTest, InterpreterScriptWithCloexecDirFD) {
+  std::string absolute_path = WorkloadPath(kExitScript);
+  std::string parent_dir = std::string(Dirname(absolute_path));
+  std::string base = std::string(Basename(absolute_path));
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_CLOEXEC | O_DIRECTORY));
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(dirfd.get(), base, {base}, {},
+                                            /*flags=*/0, /*child=*/nullptr,
+                                            &execve_errno));
+  EXPECT_EQ(execve_errno, ENOENT);
+}
+
 TEST(ExecveatTest, InvalidFlags) {
   int execve_errno;
   ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(
-- 
cgit v1.2.3


From 392c56149531c82ef3c07e2899939c0d63f0980b Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 29 Oct 2019 12:15:33 -0700
Subject: Fix PollWithFullBufferBlocks.

Set the snd/rcv buffer sizes so that the test is deterministic and runs in a
reasonable amount of time. It also ensures that we disable any auto-tuning of
the send/receive buffer which may happen.

PiperOrigin-RevId: 277337232
---
 test/syscalls/linux/tcp_socket.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index bfa031bce..277d6835a 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -394,8 +394,15 @@ TEST_P(TcpSocketTest, PollWithFullBufferBlocks) {
                          sizeof(tcp_nodelay_flag)),
               SyscallSucceeds());
 
+  // Set a 256KB send/receive buffer.
+  int buf_sz = 1 << 18;
+  EXPECT_THAT(setsockopt(t_, SOL_SOCKET, SO_RCVBUF, &buf_sz, sizeof(buf_sz)),
+              SyscallSucceedsWithValue(0));
+  EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &buf_sz, sizeof(buf_sz)),
+              SyscallSucceedsWithValue(0));
+
   // Create a large buffer that will be used for sending.
-  std::vector<char> buf(10 * sendbuf_size_);
+  std::vector<char> buf(1 << 16);
 
   // Write until we receive an error.
   while (RetryEINTR(send)(s_, buf.data(), buf.size(), 0) != -1) {
-- 
cgit v1.2.3


From 38330e93774e68324d8f43adb27178453dee18b6 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 29 Oct 2019 13:58:09 -0700
Subject: Update symlink traversal limit when resolving interpreter path.

When execveat is called on an interpreter script, the symlink count for
resolving the script path should be separate from the count for resolving the
corresponding interpreter. An ELOOP error should not occur if we do not hit
the symlink limit along any individual path, even if the total number of
symlinks encountered exceeds the limit.

Closes #574

PiperOrigin-RevId: 277358474
---
 pkg/sentry/loader/elf.go    |  2 ++
 pkg/sentry/loader/loader.go |  2 ++
 test/syscalls/linux/exec.cc | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 3ea037e4d..c2c3ec06e 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -644,6 +644,8 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error
 		// resolved, the interpreter should still be resolved if it is
 		// a symlink.
 		args.ResolveFinal = true
+		// Refresh the traversal limit.
+		*args.RemainingTraversals = linux.MaxSymlinkTraversals
 		args.Filename = bin.interpreter
 		d, i, err := openPath(ctx, args)
 		if err != nil {
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 803e7d41e..b03eeb005 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -293,6 +293,8 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
 				ctx.Infof("Error loading interpreter script: %v", err)
 				return loadedELF{}, nil, nil, nil, err
 			}
+			// Refresh the traversal limit for the interpreter.
+			*args.RemainingTraversals = linux.MaxSymlinkTraversals
 		default:
 			ctx.Infof("Unknown magic: %v", hdr)
 			return loadedELF{}, nil, nil, nil, syserror.ENOEXEC
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index a9067df2a..581f03533 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -533,6 +533,47 @@ TEST(ExecTest, CloexecEventfd) {
             W_EXITCODE(0, 0), "");
 }
 
+constexpr int kLinuxMaxSymlinks = 40;
+
+TEST(ExecTest, SymlinkLimitExceeded) {
+  std::string path = WorkloadPath(kBasicWorkload);
+
+  // Hold onto TempPath objects so they are not destructed prematurely.
+  std::vector<TempPath> symlinks;
+  for (int i = 0; i < kLinuxMaxSymlinks + 1; i++) {
+    symlinks.push_back(
+        ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateSymlinkTo("/tmp", path)));
+    path = symlinks[i].path();
+  }
+
+  int execve_errno;
+  ASSERT_NO_ERRNO_AND_VALUE(
+      ForkAndExec(path, {path}, {}, /*child=*/nullptr, &execve_errno));
+  EXPECT_EQ(execve_errno, ELOOP);
+}
+
+TEST(ExecTest, SymlinkLimitRefreshedForInterpreter) {
+  std::string tmp_dir = "/tmp";
+  std::string interpreter_path = "/bin/echo";
+  TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      tmp_dir, absl::StrCat("#!", interpreter_path), 0755));
+  std::string script_path = script.path();
+
+  // Hold onto TempPath objects so they are not destructed prematurely.
+  std::vector<TempPath> interpreter_symlinks;
+  std::vector<TempPath> script_symlinks;
+  for (int i = 0; i < kLinuxMaxSymlinks; i++) {
+    interpreter_symlinks.push_back(ASSERT_NO_ERRNO_AND_VALUE(
+        TempPath::CreateSymlinkTo(tmp_dir, interpreter_path)));
+    interpreter_path = interpreter_symlinks[i].path();
+    script_symlinks.push_back(ASSERT_NO_ERRNO_AND_VALUE(
+        TempPath::CreateSymlinkTo(tmp_dir, script_path)));
+    script_path = script_symlinks[i].path();
+  }
+
+  CheckExec(script_path, {script_path}, {}, ArgEnvExitStatus(0, 0), "");
+}
+
 TEST(ExecveatTest, BasicWithFDCWD) {
   std::string path = WorkloadPath(kBasicWorkload);
   CheckExecveat(AT_FDCWD, path, {path}, {}, /*flags=*/0, ArgEnvExitStatus(0, 0),
-- 
cgit v1.2.3


From 8bc7b8dba2dcc339ab5bd1b05c83f74a6211a7d0 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 30 Oct 2019 13:29:56 -0700
Subject: Clean up typos in test names.

PiperOrigin-RevId: 277572791
---
 test/syscalls/linux/socket_ip_tcp_generic.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 7e0deda05..592448289 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -30,7 +30,7 @@
 namespace gvisor {
 namespace testing {
 
-TEST_P(TCPSocketPairTest, TcpInfoSucceedes) {
+TEST_P(TCPSocketPairTest, TcpInfoSucceeds) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
   struct tcp_info opt = {};
@@ -39,7 +39,7 @@ TEST_P(TCPSocketPairTest, TcpInfoSucceedes) {
               SyscallSucceeds());
 }
 
-TEST_P(TCPSocketPairTest, ShortTcpInfoSucceedes) {
+TEST_P(TCPSocketPairTest, ShortTcpInfoSucceeds) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
   struct tcp_info opt = {};
@@ -48,7 +48,7 @@ TEST_P(TCPSocketPairTest, ShortTcpInfoSucceedes) {
               SyscallSucceeds());
 }
 
-TEST_P(TCPSocketPairTest, ZeroTcpInfoSucceedes) {
+TEST_P(TCPSocketPairTest, ZeroTcpInfoSucceeds) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
   struct tcp_info opt = {};
-- 
cgit v1.2.3


From db37483cb6acf55b66132d534bb734f09555b1cf Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 30 Oct 2019 15:32:20 -0700
Subject: Store endpoints inside multiPortEndpoint in a sorted order

It is required to guarantee the same order of endpoints after save/restore.

PiperOrigin-RevId: 277598665
---
 pkg/tcpip/stack/registration.go             |   3 +
 pkg/tcpip/stack/stack.go                    |  29 ++++++++
 pkg/tcpip/stack/transport_demuxer.go        |  10 +++
 pkg/tcpip/stack/transport_test.go           |  11 ++-
 pkg/tcpip/transport/icmp/endpoint.go        |   7 ++
 pkg/tcpip/transport/tcp/endpoint.go         |   7 ++
 pkg/tcpip/transport/udp/endpoint.go         |   7 ++
 runsc/boot/loader.go                        |   5 +-
 test/syscalls/linux/socket_inet_loopback.cc | 107 ++++++++++++++++++++++++++++
 9 files changed, 181 insertions(+), 5 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 0360187b8..94015ba54 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -60,6 +60,9 @@ const (
 // TransportEndpoint is the interface that needs to be implemented by transport
 // protocol (e.g., tcp, udp) endpoints that can handle packets.
 type TransportEndpoint interface {
+	// UniqueID returns an unique ID for this transport endpoint.
+	UniqueID() uint64
+
 	// HandlePacket is called by the stack when new packets arrive to
 	// this transport endpoint.
 	HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView)
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 6d6ddc0ff..115a6fcb8 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -22,6 +22,7 @@ package stack
 import (
 	"encoding/binary"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"golang.org/x/time/rate"
@@ -344,6 +345,13 @@ type ResumableEndpoint interface {
 	Resume(*Stack)
 }
 
+// uniqueIDGenerator is a default unique ID generator.
+type uniqueIDGenerator uint64
+
+func (u *uniqueIDGenerator) UniqueID() uint64 {
+	return atomic.AddUint64((*uint64)(u), 1)
+}
+
 // Stack is a networking stack, with all supported protocols, NICs, and route
 // table.
 type Stack struct {
@@ -411,6 +419,14 @@ type Stack struct {
 	// ndpDisp is the NDP event dispatcher that is used to send the netstack
 	// integrator NDP related events.
 	ndpDisp NDPDispatcher
+
+	// uniqueIDGenerator is a generator of unique identifiers.
+	uniqueIDGenerator UniqueID
+}
+
+// UniqueID is an abstract generator of unique identifiers.
+type UniqueID interface {
+	UniqueID() uint64
 }
 
 // Options contains optional Stack configuration.
@@ -434,6 +450,9 @@ type Options struct {
 	// stack (false).
 	HandleLocal bool
 
+	// UniqueID is an optional generator of unique identifiers.
+	UniqueID UniqueID
+
 	// NDPConfigs is the default NDP configurations used by interfaces.
 	//
 	// By default, NDPConfigs will have a zero value for its
@@ -506,6 +525,10 @@ func New(opts Options) *Stack {
 		clock = &tcpip.StdClock{}
 	}
 
+	if opts.UniqueID == nil {
+		opts.UniqueID = new(uniqueIDGenerator)
+	}
+
 	// Make sure opts.NDPConfigs contains valid values only.
 	opts.NDPConfigs.validate()
 
@@ -524,6 +547,7 @@ func New(opts Options) *Stack {
 		portSeed:             generateRandUint32(),
 		ndpConfigs:           opts.NDPConfigs,
 		autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
+		uniqueIDGenerator:    opts.UniqueID,
 		ndpDisp:              opts.NDPDisp,
 	}
 
@@ -551,6 +575,11 @@ func New(opts Options) *Stack {
 	return s
 }
 
+// UniqueID returns a unique identifier.
+func (s *Stack) UniqueID() uint64 {
+	return s.uniqueIDGenerator.UniqueID()
+}
+
 // SetNetworkProtocolOption allows configuring individual protocol level
 // options. This method returns an error if the protocol is not supported or
 // option is not supported by the protocol implementation or the provided value
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index f633632f0..ccd3d030e 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -17,6 +17,7 @@ package stack
 import (
 	"fmt"
 	"math/rand"
+	"sort"
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -310,6 +311,15 @@ func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, reusePo
 	// endpointsMap. This will allow us to remove endpoint from the array fast.
 	ep.endpointsMap[t] = len(ep.endpointsArr)
 	ep.endpointsArr = append(ep.endpointsArr, t)
+
+	// ep.endpointsArr is sorted by endpoint unique IDs, so that endpoints
+	// can be restored in the same order.
+	sort.Slice(ep.endpointsArr, func(i, j int) bool {
+		return ep.endpointsArr[i].UniqueID() < ep.endpointsArr[j].UniqueID()
+	})
+	for i, e := range ep.endpointsArr {
+		ep.endpointsMap[e] = i
+	}
 	return nil
 }
 
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index ae6fda3a9..203e79f56 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -43,6 +43,7 @@ type fakeTransportEndpoint struct {
 	proto    *fakeTransportProtocol
 	peerAddr tcpip.Address
 	route    stack.Route
+	uniqueID uint64
 
 	// acceptQueue is non-nil iff bound.
 	acceptQueue []fakeTransportEndpoint
@@ -56,8 +57,8 @@ func (f *fakeTransportEndpoint) Stats() tcpip.EndpointStats {
 	return nil
 }
 
-func newFakeTransportEndpoint(s *stack.Stack, proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber) tcpip.Endpoint {
-	return &fakeTransportEndpoint{stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto}
+func newFakeTransportEndpoint(s *stack.Stack, proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint {
+	return &fakeTransportEndpoint{stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID}
 }
 
 func (f *fakeTransportEndpoint) Close() {
@@ -144,6 +145,10 @@ func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	return nil
 }
 
+func (f *fakeTransportEndpoint) UniqueID() uint64 {
+	return f.uniqueID
+}
+
 func (f *fakeTransportEndpoint) ConnectEndpoint(e tcpip.Endpoint) *tcpip.Error {
 	return nil
 }
@@ -251,7 +256,7 @@ func (*fakeTransportProtocol) Number() tcpip.TransportProtocolNumber {
 }
 
 func (f *fakeTransportProtocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
-	return newFakeTransportEndpoint(stack, f, netProto), nil
+	return newFakeTransportEndpoint(stack, f, netProto, stack.UniqueID()), nil
 }
 
 func (f *fakeTransportProtocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index d0dd383fd..114a69b4e 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -58,6 +58,7 @@ type endpoint struct {
 	// immutable.
 	stack       *stack.Stack `state:"manual"`
 	waiterQueue *waiter.Queue
+	uniqueID    uint64
 
 	// The following fields are used to manage the receive queue, and are
 	// protected by rcvMu.
@@ -90,9 +91,15 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt
 		rcvBufSizeMax: 32 * 1024,
 		sndBufSize:    32 * 1024,
 		state:         stateInitial,
+		uniqueID:      s.UniqueID(),
 	}, nil
 }
 
+// UniqueID implements stack.TransportEndpoint.UniqueID.
+func (e *endpoint) UniqueID() uint64 {
+	return e.uniqueID
+}
+
 // Close puts the endpoint in a closed state and frees all resources
 // associated with it.
 func (e *endpoint) Close() {
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 8a3ca0f1b..a1efd8d55 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -287,6 +287,7 @@ type endpoint struct {
 	// change throughout the lifetime of the endpoint.
 	stack       *stack.Stack  `state:"manual"`
 	waiterQueue *waiter.Queue `state:"wait"`
+	uniqueID    uint64
 
 	// lastError represents the last error that the endpoint reported;
 	// access to it is protected by the following mutex.
@@ -504,6 +505,11 @@ type endpoint struct {
 	stats Stats `state:"nosave"`
 }
 
+// UniqueID implements stack.TransportEndpoint.UniqueID.
+func (e *endpoint) UniqueID() uint64 {
+	return e.uniqueID
+}
+
 // calculateAdvertisedMSS calculates the MSS to advertise.
 //
 // If userMSS is non-zero and is not greater than the maximum possible MSS for
@@ -565,6 +571,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 			interval: 75 * time.Second,
 			count:    9,
 		},
+		uniqueID: s.UniqueID(),
 	}
 
 	var ss SendBufferSizeOption
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index cda302bb7..68977dc25 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -80,6 +80,7 @@ type endpoint struct {
 	// change throughout the lifetime of the endpoint.
 	stack       *stack.Stack `state:"manual"`
 	waiterQueue *waiter.Queue
+	uniqueID    uint64
 
 	// The following fields are used to manage the receive queue, and are
 	// protected by rcvMu.
@@ -160,9 +161,15 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		rcvBufSizeMax: 32 * 1024,
 		sndBufSize:    32 * 1024,
 		state:         StateInitial,
+		uniqueID:      s.UniqueID(),
 	}
 }
 
+// UniqueID implements stack.TransportEndpoint.UniqueID.
+func (e *endpoint) UniqueID() uint64 {
+	return e.uniqueID
+}
+
 // Close puts the endpoint in a closed state and frees all resources
 // associated with it.
 func (e *endpoint) Close() {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 0c0eba99e..86df384f8 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -232,7 +232,7 @@ func New(args Args) (*Loader, error) {
 	// this point. Netns is configured before Run() is called. Netstack is
 	// configured using a control uRPC message. Host network is configured inside
 	// Run().
-	networkStack, err := newEmptyNetworkStack(args.Conf, k)
+	networkStack, err := newEmptyNetworkStack(args.Conf, k, k)
 	if err != nil {
 		return nil, fmt.Errorf("creating network: %v", err)
 	}
@@ -905,7 +905,7 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
+func newEmptyNetworkStack(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
 	switch conf.Network {
 	case NetworkHost:
 		return hostinet.NewStack(), nil
@@ -923,6 +923,7 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
 			// Enable raw sockets for users with sufficient
 			// privileges.
 			RawFactory: raw.EndpointFactory{},
+			UniqueID:   uniqueID,
 		})}
 
 		// Enable SACK Recovery.
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 322ee07ad..ab375aaaf 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -16,6 +16,7 @@
 #include <netinet/in.h>
 #include <poll.h>
 #include <string.h>
+#include <sys/epoll.h>
 #include <sys/socket.h>
 
 #include <atomic>
@@ -516,6 +517,112 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) {
                 EquivalentWithin((kConnectAttempts / kThreadCount), 0.10));
 }
 
+TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
+  auto const& param = GetParam();
+
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+  sockaddr_storage listen_addr = listener.addr;
+  sockaddr_storage conn_addr = connector.addr;
+  constexpr int kThreadCount = 3;
+
+  // TODO(b/141211329): endpointsByNic.seed has to be saved/restored.
+  const DisableSave ds141211329;
+
+  // Create listening sockets.
+  FileDescriptor listener_fds[kThreadCount];
+  for (int i = 0; i < kThreadCount; i++) {
+    listener_fds[i] =
+        ASSERT_NO_ERRNO_AND_VALUE(Socket(listener.family(), SOCK_DGRAM, 0));
+    int fd = listener_fds[i].get();
+
+    ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                           sizeof(kSockOptOn)),
+                SyscallSucceeds());
+    ASSERT_THAT(
+        bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
+        SyscallSucceeds());
+
+    // On the first bind we need to determine which port was bound.
+    if (i != 0) {
+      continue;
+    }
+
+    // Get the port bound by the listening socket.
+    socklen_t addrlen = listener.addr_len;
+    ASSERT_THAT(
+        getsockname(listener_fds[0].get(),
+                    reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+        SyscallSucceeds());
+    uint16_t const port =
+        ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+    ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
+    ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  }
+
+  constexpr int kConnectAttempts = 10;
+  FileDescriptor client_fds[kConnectAttempts];
+
+  // Do the first run without save/restore.
+  DisableSave ds;
+  for (int i = 0; i < kConnectAttempts; i++) {
+    client_fds[i] =
+        ASSERT_NO_ERRNO_AND_VALUE(Socket(connector.family(), SOCK_DGRAM, 0));
+    EXPECT_THAT(RetryEINTR(sendto)(client_fds[i].get(), &i, sizeof(i), 0,
+                                   reinterpret_cast<sockaddr*>(&conn_addr),
+                                   connector.addr_len),
+                SyscallSucceedsWithValue(sizeof(i)));
+  }
+  ds.reset();
+
+  // Check that the mapping of client and server sockets has
+  // not changed after save/restore.
+  for (int i = 0; i < kConnectAttempts; i++) {
+    EXPECT_THAT(RetryEINTR(sendto)(client_fds[i].get(), &i, sizeof(i), 0,
+                                   reinterpret_cast<sockaddr*>(&conn_addr),
+                                   connector.addr_len),
+                SyscallSucceedsWithValue(sizeof(i)));
+  }
+
+  int epollfd;
+  ASSERT_THAT(epollfd = epoll_create1(0), SyscallSucceeds());
+
+  for (int i = 0; i < kThreadCount; i++) {
+    int fd = listener_fds[i].get();
+    struct epoll_event ev;
+    ev.data.fd = fd;
+    ev.events = EPOLLIN;
+    ASSERT_THAT(epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev), SyscallSucceeds());
+  }
+
+  std::map<uint16_t, int> portToFD;
+
+  for (int i = 0; i < kConnectAttempts * 2; i++) {
+    struct sockaddr_storage addr = {};
+    socklen_t addrlen = sizeof(addr);
+    struct epoll_event ev;
+    int data, fd;
+
+    ASSERT_THAT(epoll_wait(epollfd, &ev, 1, -1), SyscallSucceedsWithValue(1));
+
+    fd = ev.data.fd;
+    EXPECT_THAT(RetryEINTR(recvfrom)(fd, &data, sizeof(data), 0,
+                                     reinterpret_cast<struct sockaddr*>(&addr),
+                                     &addrlen),
+                SyscallSucceedsWithValue(sizeof(data)));
+    uint16_t const port =
+        ASSERT_NO_ERRNO_AND_VALUE(AddrPort(connector.family(), addr));
+    auto prev_port = portToFD.find(port);
+    // Check that all packets from one client have been delivered to the same
+    // server socket.
+    if (prev_port == portToFD.end()) {
+      portToFD[port] = ev.data.fd;
+    } else {
+      EXPECT_EQ(portToFD[port], ev.data.fd);
+    }
+  }
+}
+
 INSTANTIATE_TEST_SUITE_P(
     All, SocketInetReusePortTest,
     ::testing::Values(
-- 
cgit v1.2.3


From af6af2c34131c4ec5e3195be99c1deb6a2669c06 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 1 Nov 2019 11:21:06 -0700
Subject: tests: don't use ASSERT_THAT after fork

PiperOrigin-RevId: 277965624
---
 test/syscalls/linux/semaphore.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc
index 40c57f543..e9b131ca9 100644
--- a/test/syscalls/linux/semaphore.cc
+++ b/test/syscalls/linux/semaphore.cc
@@ -447,9 +447,8 @@ TEST(SemaphoreTest, SemCtlGetPidFork) {
 
   const pid_t child_pid = fork();
   if (child_pid == 0) {
-    ASSERT_THAT(semctl(sem.get(), 0, SETVAL, 1), SyscallSucceeds());
-    ASSERT_THAT(semctl(sem.get(), 0, GETPID),
-                SyscallSucceedsWithValue(getpid()));
+    TEST_PCHECK(semctl(sem.get(), 0, SETVAL, 1) == 0);
+    TEST_PCHECK(semctl(sem.get(), 0, GETPID) == getpid());
 
     _exit(0);
   }
-- 
cgit v1.2.3


From 515fee5b6d4f3270c951f72283aef79a28d463dd Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Fri, 1 Nov 2019 12:42:04 -0700
Subject: Add SO_PASSCRED support to netlink sockets

Since we only support sending messages from the kernel, the peer is always
the kernel, simplifying handling.

There are currently no known users of SO_PASSCRED that would actually receive
messages from gVisor, but adding full support is barely more work than stubbing
out fake support.

Updates #1117
Fixes #1119

PiperOrigin-RevId: 277981465
---
 pkg/sentry/socket/netlink/BUILD             |   1 +
 pkg/sentry/socket/netlink/socket.go         |  76 ++++++++++++++++++-
 test/syscalls/linux/socket_netlink_route.cc | 110 +++++++++++++++++++++++++++-
 3 files changed, 183 insertions(+), 4 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index f95803f91..79589e3c8 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -20,6 +20,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/safemem",
         "//pkg/sentry/socket",
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index b2732ca29..05dac4f0a 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -27,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
@@ -61,7 +62,7 @@ var netlinkSocketDevice = device.NewAnonDevice()
 // This implementation only supports userspace sending and receiving messages
 // to/from the kernel.
 //
-// Socket implements socket.Socket.
+// Socket implements socket.Socket and transport.Credentialer.
 //
 // +stateify savable
 type Socket struct {
@@ -104,9 +105,13 @@ type Socket struct {
 	// sendBufferSize is the send buffer "size". We don't actually have a
 	// fixed buffer but only consume this many bytes.
 	sendBufferSize uint32
+
+	// passcred indicates if this socket wants SCM credentials.
+	passcred bool
 }
 
 var _ socket.Socket = (*Socket)(nil)
+var _ transport.Credentialer = (*Socket)(nil)
 
 // NewSocket creates a new Socket.
 func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socket, *syserr.Error) {
@@ -172,6 +177,22 @@ func (s *Socket) EventUnregister(e *waiter.Entry) {
 	s.ep.EventUnregister(e)
 }
 
+// Passcred implements transport.Credentialer.Passcred.
+func (s *Socket) Passcred() bool {
+	s.mu.Lock()
+	passcred := s.passcred
+	s.mu.Unlock()
+	return passcred
+}
+
+// ConnectedPasscred implements transport.Credentialer.ConnectedPasscred.
+func (s *Socket) ConnectedPasscred() bool {
+	// This socket is connected to the kernel, which doesn't need creds.
+	//
+	// This is arbitrary, as ConnectedPasscred on this type has no callers.
+	return false
+}
+
 // Ioctl implements fs.FileOperations.Ioctl.
 func (*Socket) Ioctl(context.Context, *fs.File, usermem.IO, arch.SyscallArguments) (uintptr, error) {
 	// TODO(b/68878065): no ioctls supported.
@@ -309,9 +330,20 @@ func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.
 			// We don't have limit on receiving size.
 			return int32(math.MaxInt32), nil
 
+		case linux.SO_PASSCRED:
+			if outLen < sizeOfInt32 {
+				return nil, syserr.ErrInvalidArgument
+			}
+			var passcred int32
+			if s.Passcred() {
+				passcred = 1
+			}
+			return passcred, nil
+
 		default:
 			socket.GetSockOptEmitUnimplementedEvent(t, name)
 		}
+
 	case linux.SOL_NETLINK:
 		switch name {
 		case linux.NETLINK_BROADCAST_ERROR,
@@ -348,6 +380,7 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy
 			s.sendBufferSize = size
 			s.mu.Unlock()
 			return nil
+
 		case linux.SO_RCVBUF:
 			if len(opt) < sizeOfInt32 {
 				return syserr.ErrInvalidArgument
@@ -355,6 +388,18 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy
 			// We don't have limit on receiving size. So just accept anything as
 			// valid for compatibility.
 			return nil
+
+		case linux.SO_PASSCRED:
+			if len(opt) < sizeOfInt32 {
+				return syserr.ErrInvalidArgument
+			}
+			passcred := usermem.ByteOrder.Uint32(opt)
+
+			s.mu.Lock()
+			s.passcred = passcred != 0
+			s.mu.Unlock()
+			return nil
+
 		default:
 			socket.SetSockOptEmitUnimplementedEvent(t, name)
 		}
@@ -483,6 +528,26 @@ func (s *Socket) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _
 	})
 }
 
+// kernelSCM implements control.SCMCredentials with credentials that represent
+// the kernel itself rather than a Task.
+//
+// +stateify savable
+type kernelSCM struct{}
+
+// Equals implements transport.CredentialsControlMessage.Equals.
+func (kernelSCM) Equals(oc transport.CredentialsControlMessage) bool {
+	_, ok := oc.(kernelSCM)
+	return ok
+}
+
+// Credentials implements control.SCMCredentials.Credentials.
+func (kernelSCM) Credentials(*kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) {
+	return 0, auth.RootUID, auth.RootGID
+}
+
+// kernelCreds is the concrete version of kernelSCM used in all creds.
+var kernelCreds = &kernelSCM{}
+
 // sendResponse sends the response messages in ms back to userspace.
 func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error {
 	// Linux combines multiple netlink messages into a single datagram.
@@ -491,10 +556,15 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error
 		bufs = append(bufs, m.Finalize())
 	}
 
+	// All messages are from the kernel.
+	cms := transport.ControlMessages{
+		Credentials: kernelCreds,
+	}
+
 	if len(bufs) > 0 {
 		// RecvMsg never receives the address, so we don't need to send
 		// one.
-		_, notify, err := s.connection.Send(bufs, transport.ControlMessages{}, tcpip.FullAddress{})
+		_, notify, err := s.connection.Send(bufs, cms, tcpip.FullAddress{})
 		// If the buffer is full, we simply drop messages, just like
 		// Linux.
 		if err != nil && err != syserr.ErrWouldBlock {
@@ -521,7 +591,7 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error
 		// Add the dump_done_errno payload.
 		m.Put(int64(0))
 
-		_, notify, err := s.connection.Send([][]byte{m.Finalize()}, transport.ControlMessages{}, tcpip.FullAddress{})
+		_, notify, err := s.connection.Send([][]byte{m.Finalize()}, cms, tcpip.FullAddress{})
 		if err != nil && err != syserr.ErrWouldBlock {
 			return err
 		}
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index dd4a11655..be0dadcd6 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -195,7 +195,8 @@ INSTANTIATE_TEST_SUITE_P(
         std::make_tuple(SO_DOMAIN, IsEqual(AF_NETLINK),
                         absl::StrFormat("AF_NETLINK (%d)", AF_NETLINK)),
         std::make_tuple(SO_PROTOCOL, IsEqual(NETLINK_ROUTE),
-                        absl::StrFormat("NETLINK_ROUTE (%d)", NETLINK_ROUTE))));
+                        absl::StrFormat("NETLINK_ROUTE (%d)", NETLINK_ROUTE)),
+        std::make_tuple(SO_PASSCRED, IsEqual(0), "0")));
 
 // Validates the reponses to RTM_GETLINK + NLM_F_DUMP.
 void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
@@ -692,6 +693,113 @@ TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
   } while (type != NLMSG_DONE && type != NLMSG_ERROR);
 }
 
+// No SCM_CREDENTIALS are received without SO_PASSCRED set.
+TEST(NetlinkRouteTest, NoPasscredNoCreds) {
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+
+  ASSERT_THAT(setsockopt(fd.get(), SOL_SOCKET, SO_PASSCRED, &kSockOptOff,
+                         sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct rtgenmsg rgm;
+  };
+
+  constexpr uint32_t kSeq = 12345;
+
+  struct request req;
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETADDR;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_seq = kSeq;
+  req.rgm.rtgen_family = AF_UNSPEC;
+
+  struct iovec iov = {};
+  iov.iov_base = &req;
+  iov.iov_len = sizeof(req);
+
+  struct msghdr msg = {};
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+  iov.iov_base = NULL;
+  iov.iov_len = 0;
+
+  char control[CMSG_SPACE(sizeof(struct ucred))] = {};
+  msg.msg_control = control;
+  msg.msg_controllen = sizeof(control);
+
+  // Note: This test assumes at least one message is returned by the
+  // RTM_GETADDR request.
+  ASSERT_THAT(RetryEINTR(recvmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+  // No control messages.
+  EXPECT_EQ(CMSG_FIRSTHDR(&msg), nullptr);
+}
+
+// SCM_CREDENTIALS are received with SO_PASSCRED set.
+TEST(NetlinkRouteTest, PasscredCreds) {
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+
+  ASSERT_THAT(setsockopt(fd.get(), SOL_SOCKET, SO_PASSCRED, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct rtgenmsg rgm;
+  };
+
+  constexpr uint32_t kSeq = 12345;
+
+  struct request req;
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETADDR;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_seq = kSeq;
+  req.rgm.rtgen_family = AF_UNSPEC;
+
+  struct iovec iov = {};
+  iov.iov_base = &req;
+  iov.iov_len = sizeof(req);
+
+  struct msghdr msg = {};
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+  iov.iov_base = NULL;
+  iov.iov_len = 0;
+
+  char control[CMSG_SPACE(sizeof(struct ucred))] = {};
+  msg.msg_control = control;
+  msg.msg_controllen = sizeof(control);
+
+  // Note: This test assumes at least one message is returned by the
+  // RTM_GETADDR request.
+  ASSERT_THAT(RetryEINTR(recvmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+  struct ucred creds;
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+  ASSERT_NE(cmsg, nullptr);
+  ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(creds)));
+  ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+  ASSERT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+
+  memcpy(&creds, CMSG_DATA(cmsg), sizeof(creds));
+
+  // The peer is the kernel, which is "PID" 0.
+  EXPECT_EQ(creds.pid, 0);
+  // The kernel identifies as root. Also allow nobody in case this test is
+  // running in a userns without root mapped.
+  EXPECT_THAT(creds.uid, AnyOf(Eq(0), Eq(65534)));
+  EXPECT_THAT(creds.gid, AnyOf(Eq(0), Eq(65534)));
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From b23b36e701c40827065217f4652a51eebc5f9913 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 4 Nov 2019 10:06:00 -0800
Subject: Add NETLINK_KOBJECT_UEVENT socket support

NETLINK_KOBJECT_UEVENT sockets send udev-style messages for device events.
gVisor doesn't have any device events, so our sockets don't need to do anything
once created.

systemd's device manager needs to be able to create one of these sockets. It
also wants to install a BPF filter on the socket. Since we'll never send any
messages, the filter would never be invoked, thus we just fake it out.

Fixes #1117
Updates #1119

PiperOrigin-RevId: 278405893
---
 pkg/sentry/socket/netlink/provider.go        |   7 ++
 pkg/sentry/socket/netlink/route/protocol.go  |   5 +
 pkg/sentry/socket/netlink/socket.go          |  42 ++++++++
 pkg/sentry/socket/netlink/uevent/BUILD       |  17 +++
 pkg/sentry/socket/netlink/uevent/protocol.go |  60 +++++++++++
 runsc/boot/BUILD                             |   1 +
 runsc/boot/loader.go                         |   1 +
 test/syscalls/BUILD                          |   4 +
 test/syscalls/linux/BUILD                    |  29 +++++
 test/syscalls/linux/socket_netdevice.cc      |   3 +-
 test/syscalls/linux/socket_netlink.cc        | 153 +++++++++++++++++++++++++++
 test/syscalls/linux/socket_netlink_route.cc  | 140 ++++--------------------
 test/syscalls/linux/socket_netlink_uevent.cc |  83 +++++++++++++++
 test/syscalls/linux/socket_netlink_util.cc   |   5 +-
 test/syscalls/linux/socket_netlink_util.h    |   5 +-
 15 files changed, 431 insertions(+), 124 deletions(-)
 create mode 100644 pkg/sentry/socket/netlink/uevent/BUILD
 create mode 100644 pkg/sentry/socket/netlink/uevent/protocol.go
 create mode 100644 test/syscalls/linux/socket_netlink.cc
 create mode 100644 test/syscalls/linux/socket_netlink_uevent.cc

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go
index 689cad997..be005df24 100644
--- a/pkg/sentry/socket/netlink/provider.go
+++ b/pkg/sentry/socket/netlink/provider.go
@@ -30,6 +30,13 @@ type Protocol interface {
 	// Protocol returns the Linux netlink protocol value.
 	Protocol() int
 
+	// CanSend returns true if this protocol may ever send messages.
+	//
+	// TODO(gvisor.dev/issue/1119): This is a workaround to allow
+	// advertising support for otherwise unimplemented features on sockets
+	// that will never send messages, thus making those features no-ops.
+	CanSend() bool
+
 	// ProcessMessage processes a single message from userspace.
 	//
 	// If err == nil, any messages added to ms will be sent back to the
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index cc70ac237..6b4a0ecf4 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -61,6 +61,11 @@ func (p *Protocol) Protocol() int {
 	return linux.NETLINK_ROUTE
 }
 
+// CanSend implements netlink.Protocol.CanSend.
+func (p *Protocol) CanSend() bool {
+	return true
+}
+
 // dumpLinks handles RTM_GETLINK + NLM_F_DUMP requests.
 func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
 	// NLM_F_DUMP + RTM_GETLINK messages are supposed to include an
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 05dac4f0a..4a1b87a9a 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -54,6 +54,8 @@ const (
 	maxSendBufferSize = 4 << 20 // 4MB
 )
 
+var errNoFilter = syserr.New("no filter attached", linux.ENOENT)
+
 // netlinkSocketDevice is the netlink socket virtual device.
 var netlinkSocketDevice = device.NewAnonDevice()
 
@@ -108,6 +110,12 @@ type Socket struct {
 
 	// passcred indicates if this socket wants SCM credentials.
 	passcred bool
+
+	// filter indicates that this socket has a BPF filter "installed".
+	//
+	// TODO(gvisor.dev/issue/1119): We don't actually support filtering,
+	// this is just bookkeeping for tracking add/remove.
+	filter bool
 }
 
 var _ socket.Socket = (*Socket)(nil)
@@ -400,6 +408,40 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy
 			s.mu.Unlock()
 			return nil
 
+		case linux.SO_ATTACH_FILTER:
+			// TODO(gvisor.dev/issue/1119): We don't actually
+			// support filtering. If this socket can't ever send
+			// messages, then there is nothing to filter and we can
+			// advertise support. Otherwise, be conservative and
+			// return an error.
+			if s.protocol.CanSend() {
+				socket.SetSockOptEmitUnimplementedEvent(t, name)
+				return syserr.ErrProtocolNotAvailable
+			}
+
+			s.mu.Lock()
+			s.filter = true
+			s.mu.Unlock()
+			return nil
+
+		case linux.SO_DETACH_FILTER:
+			// TODO(gvisor.dev/issue/1119): See above.
+			if s.protocol.CanSend() {
+				socket.SetSockOptEmitUnimplementedEvent(t, name)
+				return syserr.ErrProtocolNotAvailable
+			}
+
+			s.mu.Lock()
+			filter := s.filter
+			s.filter = false
+			s.mu.Unlock()
+
+			if !filter {
+				return errNoFilter
+			}
+
+			return nil
+
 		default:
 			socket.SetSockOptEmitUnimplementedEvent(t, name)
 		}
diff --git a/pkg/sentry/socket/netlink/uevent/BUILD b/pkg/sentry/socket/netlink/uevent/BUILD
new file mode 100644
index 000000000..0777f3baf
--- /dev/null
+++ b/pkg/sentry/socket/netlink/uevent/BUILD
@@ -0,0 +1,17 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "uevent",
+    srcs = ["protocol.go"],
+    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent",
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/context",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/socket/netlink",
+        "//pkg/syserr",
+    ],
+)
diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go
new file mode 100644
index 000000000..b5d7808d7
--- /dev/null
+++ b/pkg/sentry/socket/netlink/uevent/protocol.go
@@ -0,0 +1,60 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package uevent provides a NETLINK_KOBJECT_UEVENT socket protocol.
+//
+// NETLINK_KOBJECT_UEVENT sockets send udev-style device events. gVisor does
+// not support any device events, so these sockets never send any messages.
+package uevent
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
+	"gvisor.dev/gvisor/pkg/syserr"
+)
+
+// Protocol implements netlink.Protocol.
+//
+// +stateify savable
+type Protocol struct{}
+
+var _ netlink.Protocol = (*Protocol)(nil)
+
+// NewProtocol creates a NETLINK_KOBJECT_UEVENT netlink.Protocol.
+func NewProtocol(t *kernel.Task) (netlink.Protocol, *syserr.Error) {
+	return &Protocol{}, nil
+}
+
+// Protocol implements netlink.Protocol.Protocol.
+func (p *Protocol) Protocol() int {
+	return linux.NETLINK_KOBJECT_UEVENT
+}
+
+// CanSend implements netlink.Protocol.CanSend.
+func (p *Protocol) CanSend() bool {
+	return false
+}
+
+// ProcessMessage implements netlink.Protocol.ProcessMessage.
+func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+	// Silently ignore all messages.
+	return nil
+}
+
+// init registers the NETLINK_KOBJECT_UEVENT provider.
+func init() {
+	netlink.RegisterProvider(linux.NETLINK_KOBJECT_UEVENT, NewProtocol)
+}
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 6fe2b57de..58e86ae7f 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -60,6 +60,7 @@ go_library(
         "//pkg/sentry/socket/hostinet",
         "//pkg/sentry/socket/netlink",
         "//pkg/sentry/socket/netlink/route",
+        "//pkg/sentry/socket/netlink/uevent",
         "//pkg/sentry/socket/netstack",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/state",
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 4d1bd2d08..f05d5973f 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -65,6 +65,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route"
+	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
 )
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index a53a23afd..3e5b6b3c3 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -511,8 +511,12 @@ syscall_test(test = "//test/syscalls/linux:socket_ip_unbound_test")
 
 syscall_test(test = "//test/syscalls/linux:socket_netdevice_test")
 
+syscall_test(test = "//test/syscalls/linux:socket_netlink_test")
+
 syscall_test(test = "//test/syscalls/linux:socket_netlink_route_test")
 
+syscall_test(test = "//test/syscalls/linux:socket_netlink_uevent_test")
+
 syscall_test(test = "//test/syscalls/linux:socket_blocking_local_test")
 
 syscall_test(test = "//test/syscalls/linux:socket_blocking_ip_test")
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 833fbaa09..93bff8299 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2675,6 +2675,20 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "socket_netlink_test",
+    testonly = 1,
+    srcs = ["socket_netlink.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        "//test/util:file_descriptor",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_binary(
     name = "socket_netlink_route_test",
     testonly = 1,
@@ -2692,6 +2706,21 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "socket_netlink_uevent_test",
+    testonly = 1,
+    srcs = ["socket_netlink_uevent.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_netlink_util",
+        ":socket_test_util",
+        "//test/util:file_descriptor",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 # These socket tests are in a library because the test cases are shared
 # across several test build targets.
 cc_library(
diff --git a/test/syscalls/linux/socket_netdevice.cc b/test/syscalls/linux/socket_netdevice.cc
index 765f8e0e4..405dbbd73 100644
--- a/test/syscalls/linux/socket_netdevice.cc
+++ b/test/syscalls/linux/socket_netdevice.cc
@@ -68,7 +68,8 @@ TEST(NetdeviceTest, Netmask) {
 
   // Use a netlink socket to get the netmask, which we'll then compare to the
   // netmask obtained via ioctl.
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
diff --git a/test/syscalls/linux/socket_netlink.cc b/test/syscalls/linux/socket_netlink.cc
new file mode 100644
index 000000000..4ec0fd4fa
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink.cc
@@ -0,0 +1,153 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/netlink.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+// Tests for all netlink socket protocols.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// NetlinkTest parameter is the protocol to test.
+using NetlinkTest = ::testing::TestWithParam<int>;
+
+// Netlink sockets must be SOCK_DGRAM or SOCK_RAW.
+TEST_P(NetlinkTest, Types) {
+  const int protocol = GetParam();
+
+  EXPECT_THAT(socket(AF_NETLINK, SOCK_STREAM, protocol),
+              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+  EXPECT_THAT(socket(AF_NETLINK, SOCK_SEQPACKET, protocol),
+              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+  EXPECT_THAT(socket(AF_NETLINK, SOCK_RDM, protocol),
+              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+  EXPECT_THAT(socket(AF_NETLINK, SOCK_DCCP, protocol),
+              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+  EXPECT_THAT(socket(AF_NETLINK, SOCK_PACKET, protocol),
+              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+
+  int fd;
+  EXPECT_THAT(fd = socket(AF_NETLINK, SOCK_DGRAM, protocol), SyscallSucceeds());
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+
+  EXPECT_THAT(fd = socket(AF_NETLINK, SOCK_RAW, protocol), SyscallSucceeds());
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST_P(NetlinkTest, AutomaticPort) {
+  const int protocol = GetParam();
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, protocol));
+
+  struct sockaddr_nl addr = {};
+  addr.nl_family = AF_NETLINK;
+
+  EXPECT_THAT(
+      bind(fd.get(), reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+      SyscallSucceeds());
+
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                          &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, sizeof(addr));
+  // This is the only netlink socket in the process, so it should get the PID as
+  // the port id.
+  //
+  // N.B. Another process could theoretically have explicitly reserved our pid
+  // as a port ID, but that is very unlikely.
+  EXPECT_EQ(addr.nl_pid, getpid());
+}
+
+// Calling connect automatically binds to an automatic port.
+TEST_P(NetlinkTest, ConnectBinds) {
+  const int protocol = GetParam();
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, protocol));
+
+  struct sockaddr_nl addr = {};
+  addr.nl_family = AF_NETLINK;
+
+  EXPECT_THAT(connect(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                      sizeof(addr)),
+              SyscallSucceeds());
+
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                          &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, sizeof(addr));
+
+  // Each test is running in a pid namespace, so another process can explicitly
+  // reserve our pid as a port ID. In this case, a negative portid value will be
+  // set.
+  if (static_cast<pid_t>(addr.nl_pid) > 0) {
+    EXPECT_EQ(addr.nl_pid, getpid());
+  }
+
+  memset(&addr, 0, sizeof(addr));
+  addr.nl_family = AF_NETLINK;
+
+  // Connecting again is allowed, but keeps the same port.
+  EXPECT_THAT(connect(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                      sizeof(addr)),
+              SyscallSucceeds());
+
+  addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                          &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, sizeof(addr));
+  EXPECT_EQ(addr.nl_pid, getpid());
+}
+
+TEST_P(NetlinkTest, GetPeerName) {
+  const int protocol = GetParam();
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, protocol));
+
+  struct sockaddr_nl addr = {};
+  socklen_t addrlen = sizeof(addr);
+
+  EXPECT_THAT(getpeername(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                          &addrlen),
+              SyscallSucceeds());
+
+  EXPECT_EQ(addrlen, sizeof(addr));
+  EXPECT_EQ(addr.nl_family, AF_NETLINK);
+  // Peer is the kernel if we didn't connect elsewhere.
+  EXPECT_EQ(addr.nl_pid, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(ProtocolTest, NetlinkTest,
+                         ::testing::Values(NETLINK_ROUTE,
+                                           NETLINK_KOBJECT_UEVENT));
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index be0dadcd6..ef567f512 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -41,112 +41,7 @@ namespace {
 using ::testing::AnyOf;
 using ::testing::Eq;
 
-// Netlink sockets must be SOCK_DGRAM or SOCK_RAW.
-TEST(NetlinkRouteTest, Types) {
-  EXPECT_THAT(socket(AF_NETLINK, SOCK_STREAM, NETLINK_ROUTE),
-              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
-  EXPECT_THAT(socket(AF_NETLINK, SOCK_SEQPACKET, NETLINK_ROUTE),
-              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
-  EXPECT_THAT(socket(AF_NETLINK, SOCK_RDM, NETLINK_ROUTE),
-              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
-  EXPECT_THAT(socket(AF_NETLINK, SOCK_DCCP, NETLINK_ROUTE),
-              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
-  EXPECT_THAT(socket(AF_NETLINK, SOCK_PACKET, NETLINK_ROUTE),
-              SyscallFailsWithErrno(ESOCKTNOSUPPORT));
-
-  int fd;
-  EXPECT_THAT(fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE),
-              SyscallSucceeds());
-  EXPECT_THAT(close(fd), SyscallSucceeds());
-
-  EXPECT_THAT(fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE),
-              SyscallSucceeds());
-  EXPECT_THAT(close(fd), SyscallSucceeds());
-}
-
-TEST(NetlinkRouteTest, AutomaticPort) {
-  FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE));
-
-  struct sockaddr_nl addr = {};
-  addr.nl_family = AF_NETLINK;
-
-  EXPECT_THAT(
-      bind(fd.get(), reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
-      SyscallSucceeds());
-
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
-                          &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, sizeof(addr));
-  // This is the only netlink socket in the process, so it should get the PID as
-  // the port id.
-  //
-  // N.B. Another process could theoretically have explicitly reserved our pid
-  // as a port ID, but that is very unlikely.
-  EXPECT_EQ(addr.nl_pid, getpid());
-}
-
-// Calling connect automatically binds to an automatic port.
-TEST(NetlinkRouteTest, ConnectBinds) {
-  FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE));
-
-  struct sockaddr_nl addr = {};
-  addr.nl_family = AF_NETLINK;
-
-  EXPECT_THAT(connect(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
-                      sizeof(addr)),
-              SyscallSucceeds());
-
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
-                          &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, sizeof(addr));
-
-  // Each test is running in a pid namespace, so another process can explicitly
-  // reserve our pid as a port ID. In this case, a negative portid value will be
-  // set.
-  if (static_cast<pid_t>(addr.nl_pid) > 0) {
-    EXPECT_EQ(addr.nl_pid, getpid());
-  }
-
-  memset(&addr, 0, sizeof(addr));
-  addr.nl_family = AF_NETLINK;
-
-  // Connecting again is allowed, but keeps the same port.
-  EXPECT_THAT(connect(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
-                      sizeof(addr)),
-              SyscallSucceeds());
-
-  addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
-                          &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, sizeof(addr));
-  EXPECT_EQ(addr.nl_pid, getpid());
-}
-
-TEST(NetlinkRouteTest, GetPeerName) {
-  FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE));
-
-  struct sockaddr_nl addr = {};
-  socklen_t addrlen = sizeof(addr);
-
-  EXPECT_THAT(getpeername(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
-                          &addrlen),
-              SyscallSucceeds());
-
-  EXPECT_EQ(addrlen, sizeof(addr));
-  EXPECT_EQ(addr.nl_family, AF_NETLINK);
-  // Peer is the kernel if we didn't connect elsewhere.
-  EXPECT_EQ(addr.nl_pid, 0);
-}
-
-// Parameters for GetSockOpt test. They are:
+// Parameters for SockOptTest. They are:
 // 0: Socket option to query.
 // 1: A predicate to run on the returned sockopt value. Should return true if
 //    the value is considered ok.
@@ -219,7 +114,8 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
 }
 
 TEST(NetlinkRouteTest, GetLinkDump) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
@@ -260,7 +156,8 @@ TEST(NetlinkRouteTest, GetLinkDump) {
 }
 
 TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
@@ -293,7 +190,8 @@ TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
 }
 
 TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
@@ -332,7 +230,8 @@ TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
 }
 
 TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
@@ -373,7 +272,8 @@ TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
 }
 
 TEST(NetlinkRouteTest, ControlMessageIgnored) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
@@ -408,7 +308,8 @@ TEST(NetlinkRouteTest, ControlMessageIgnored) {
 }
 
 TEST(NetlinkRouteTest, GetAddrDump) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
@@ -468,7 +369,8 @@ TEST(NetlinkRouteTest, LookupAll) {
 
 // GetRouteDump tests a RTM_GETROUTE + NLM_F_DUMP request.
 TEST(NetlinkRouteTest, GetRouteDump) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
@@ -544,7 +446,8 @@ TEST(NetlinkRouteTest, GetRouteDump) {
 // buffer. MSG_TRUNC with a zero length buffer should consume subsequent
 // messages off the socket.
 TEST(NetlinkRouteTest, RecvmsgTrunc) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
@@ -620,7 +523,8 @@ TEST(NetlinkRouteTest, RecvmsgTrunc) {
 // it, so a properly sized buffer can be allocated to store the message. This
 // test tests that scenario.
 TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
@@ -695,7 +599,8 @@ TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
 
 // No SCM_CREDENTIALS are received without SO_PASSCRED set.
 TEST(NetlinkRouteTest, NoPasscredNoCreds) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   ASSERT_THAT(setsockopt(fd.get(), SOL_SOCKET, SO_PASSCRED, &kSockOptOff,
                          sizeof(kSockOptOff)),
@@ -742,7 +647,8 @@ TEST(NetlinkRouteTest, NoPasscredNoCreds) {
 
 // SCM_CREDENTIALS are received with SO_PASSCRED set.
 TEST(NetlinkRouteTest, PasscredCreds) {
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket());
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   ASSERT_THAT(setsockopt(fd.get(), SOL_SOCKET, SO_PASSCRED, &kSockOptOn,
                          sizeof(kSockOptOn)),
diff --git a/test/syscalls/linux/socket_netlink_uevent.cc b/test/syscalls/linux/socket_netlink_uevent.cc
new file mode 100644
index 000000000..da425bed4
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_uevent.cc
@@ -0,0 +1,83 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/filter.h>
+#include <linux/netlink.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_netlink_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+// Tests for NETLINK_KOBJECT_UEVENT sockets.
+//
+// gVisor never sends any messages on these sockets, so we don't test the events
+// themselves.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// SO_PASSCRED can be enabled. Since no messages are sent in gVisor, we don't
+// actually test receiving credentials.
+TEST(NetlinkUeventTest, PassCred) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_KOBJECT_UEVENT));
+
+  EXPECT_THAT(setsockopt(fd.get(), SOL_SOCKET, SO_PASSCRED, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+}
+
+// SO_DETACH_FILTER fails without a filter already installed.
+TEST(NetlinkUeventTest, DetachNoFilter) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_KOBJECT_UEVENT));
+
+  int opt;
+  EXPECT_THAT(
+      setsockopt(fd.get(), SOL_SOCKET, SO_DETACH_FILTER, &opt, sizeof(opt)),
+      SyscallFailsWithErrno(ENOENT));
+}
+
+// We can attach a BPF filter.
+TEST(NetlinkUeventTest, AttachFilter) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_KOBJECT_UEVENT));
+
+  // Minimal BPF program: a single ret.
+  struct sock_filter filter = {0x6, 0, 0, 0};
+  struct sock_fprog prog = {};
+  prog.len = 1;
+  prog.filter = &filter;
+
+  EXPECT_THAT(
+      setsockopt(fd.get(), SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog)),
+      SyscallSucceeds());
+
+  int opt;
+  EXPECT_THAT(
+      setsockopt(fd.get(), SOL_SOCKET, SO_DETACH_FILTER, &opt, sizeof(opt)),
+      SyscallSucceeds());
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index fcb8f8a88..5f05bab10 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -16,7 +16,6 @@
 
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
-#include <linux/rtnetlink.h>
 
 #include <vector>
 
@@ -27,9 +26,9 @@
 namespace gvisor {
 namespace testing {
 
-PosixErrorOr<FileDescriptor> NetlinkBoundSocket() {
+PosixErrorOr<FileDescriptor> NetlinkBoundSocket(int protocol) {
   FileDescriptor fd;
-  ASSIGN_OR_RETURN_ERRNO(fd, Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE));
+  ASSIGN_OR_RETURN_ERRNO(fd, Socket(AF_NETLINK, SOCK_RAW, protocol));
 
   struct sockaddr_nl addr = {};
   addr.nl_family = AF_NETLINK;
diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index db8639a2f..da99f0d60 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -17,7 +17,6 @@
 
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
-#include <linux/rtnetlink.h>
 
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
@@ -25,8 +24,8 @@
 namespace gvisor {
 namespace testing {
 
-// Returns a bound NETLINK_ROUTE socket.
-PosixErrorOr<FileDescriptor> NetlinkBoundSocket();
+// Returns a bound netlink socket.
+PosixErrorOr<FileDescriptor> NetlinkBoundSocket(int protocol);
 
 // Returns the port ID of the passed socket.
 PosixErrorOr<uint32_t> NetlinkPortID(int fd);
-- 
cgit v1.2.3


From 493334f8b594eb1c2b0f5a6133dbedad4e0ecd32 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 4 Nov 2019 15:59:11 -0800
Subject: kokoro: run KVM syscall tests

We don't know how stable they are, so let's start with a warning.

PiperOrigin-RevId: 278484186
---
 kokoro/syscall_kvm_tests.cfg  |  9 +++++++++
 scripts/syscall_kvm_tests.sh  | 21 +++++++++++++++++++++
 test/syscalls/linux/itimer.cc |  6 ++++++
 3 files changed, 36 insertions(+)
 create mode 100644 kokoro/syscall_kvm_tests.cfg
 create mode 100755 scripts/syscall_kvm_tests.sh

(limited to 'test/syscalls/linux')

diff --git a/kokoro/syscall_kvm_tests.cfg b/kokoro/syscall_kvm_tests.cfg
new file mode 100644
index 000000000..3b99e9c13
--- /dev/null
+++ b/kokoro/syscall_kvm_tests.cfg
@@ -0,0 +1,9 @@
+build_file: "repo/scripts/syscall_kvm_tests.sh"
+
+action {
+  define_artifacts {
+    regex: "**/sponge_log.xml"
+    regex: "**/sponge_log.log"
+    regex: "**/outputs.zip"
+  }
+}
diff --git a/scripts/syscall_kvm_tests.sh b/scripts/syscall_kvm_tests.sh
new file mode 100755
index 000000000..de85daa5a
--- /dev/null
+++ b/scripts/syscall_kvm_tests.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+# TODO(b/112165693): "test --test_tag_filters=runsc_kvm" can be used
+# once the "manual" tag is removed from the kvm tests.
+test `bazel query "attr(tags, runsc_kvm, tests(//test/syscalls/...))"`
diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc
index 930d2b940..b77e4cbd1 100644
--- a/test/syscalls/linux/itimer.cc
+++ b/test/syscalls/linux/itimer.cc
@@ -267,6 +267,9 @@ int TestSIGPROFFairness(absl::Duration sleep) {
 // Random save/restore is disabled as it introduces additional latency and
 // unpredictable distribution patterns.
 TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive_NoRandomSave) {
+  // TODO(b/143247272): CPU time accounting is inaccurate for the KVM platform.
+  SKIP_IF(GvisorPlatform() == Platform::kKVM);
+
   pid_t child;
   int execve_errno;
   auto kill = ASSERT_NO_ERRNO_AND_VALUE(
@@ -288,6 +291,9 @@ TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive_NoRandomSave) {
 // Random save/restore is disabled as it introduces additional latency and
 // unpredictable distribution patterns.
 TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyIdle_NoRandomSave) {
+  // TODO(b/143247272): CPU time accounting is inaccurate for the KVM platform.
+  SKIP_IF(GvisorPlatform() == Platform::kKVM);
+
   pid_t child;
   int execve_errno;
   auto kill = ASSERT_NO_ERRNO_AND_VALUE(
-- 
cgit v1.2.3


From e1b21f3c8ca989dc94b25526fda1bb107691f1af Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Wed, 6 Nov 2019 14:24:38 -0800
Subject: Use PacketBuffers, rather than VectorisedViews, in netstack.

PacketBuffers are analogous to Linux's sk_buff. They hold all information about
a packet, including its headers and payload. This is important for:

* iptables to access various headers of packets
* Preventing the clutter of passing different net and link headers along with
  VectorisedViews to packet handling functions.

This change only affects the incoming packet path, and a future change will
change the outgoing path.

Benchmark               Regular         PacketBufferPtr  PacketBufferConcrete
--------------------------------------------------------------------------------
BM_Recvmsg             400.715MB/s      373.676MB/s      396.276MB/s
BM_Sendmsg             361.832MB/s      333.003MB/s      335.571MB/s
BM_Recvfrom            453.336MB/s      393.321MB/s      381.650MB/s
BM_Sendto              378.052MB/s      372.134MB/s      341.342MB/s
BM_SendmsgTCP/0/1k     353.711MB/s      316.216MB/s      322.747MB/s
BM_SendmsgTCP/0/2k     600.681MB/s      588.776MB/s      565.050MB/s
BM_SendmsgTCP/0/4k     995.301MB/s      888.808MB/s      941.888MB/s
BM_SendmsgTCP/0/8k     1.517GB/s        1.274GB/s        1.345GB/s
BM_SendmsgTCP/0/16k    1.872GB/s        1.586GB/s        1.698GB/s
BM_SendmsgTCP/0/32k    1.017GB/s        1.020GB/s        1.133GB/s
BM_SendmsgTCP/0/64k    475.626MB/s      584.587MB/s      627.027MB/s
BM_SendmsgTCP/0/128k   416.371MB/s      503.434MB/s      409.850MB/s
BM_SendmsgTCP/0/256k   323.449MB/s      449.599MB/s      388.852MB/s
BM_SendmsgTCP/0/512k   243.992MB/s      267.676MB/s      314.474MB/s
BM_SendmsgTCP/0/1M     95.138MB/s       95.874MB/s       95.417MB/s
BM_SendmsgTCP/0/2M     96.261MB/s       94.977MB/s       96.005MB/s
BM_SendmsgTCP/0/4M     96.512MB/s       95.978MB/s       95.370MB/s
BM_SendmsgTCP/0/8M     95.603MB/s       95.541MB/s       94.935MB/s
BM_SendmsgTCP/0/16M    94.598MB/s       94.696MB/s       94.521MB/s
BM_SendmsgTCP/0/32M    94.006MB/s       94.671MB/s       94.768MB/s
BM_SendmsgTCP/0/64M    94.133MB/s       94.333MB/s       94.746MB/s
BM_SendmsgTCP/0/128M   93.615MB/s       93.497MB/s       93.573MB/s
BM_SendmsgTCP/0/256M   93.241MB/s       95.100MB/s       93.272MB/s
BM_SendmsgTCP/1/1k     303.644MB/s      316.074MB/s      308.430MB/s
BM_SendmsgTCP/1/2k     537.093MB/s      584.962MB/s      529.020MB/s
BM_SendmsgTCP/1/4k     882.362MB/s      939.087MB/s      892.285MB/s
BM_SendmsgTCP/1/8k     1.272GB/s        1.394GB/s        1.296GB/s
BM_SendmsgTCP/1/16k    1.802GB/s        2.019GB/s        1.830GB/s
BM_SendmsgTCP/1/32k    2.084GB/s        2.173GB/s        2.156GB/s
BM_SendmsgTCP/1/64k    2.515GB/s        2.463GB/s        2.473GB/s
BM_SendmsgTCP/1/128k   2.811GB/s        3.004GB/s        2.946GB/s
BM_SendmsgTCP/1/256k   3.008GB/s        3.159GB/s        3.171GB/s
BM_SendmsgTCP/1/512k   2.980GB/s        3.150GB/s        3.126GB/s
BM_SendmsgTCP/1/1M     2.165GB/s        2.233GB/s        2.163GB/s
BM_SendmsgTCP/1/2M     2.370GB/s        2.219GB/s        2.453GB/s
BM_SendmsgTCP/1/4M     2.005GB/s        2.091GB/s        2.214GB/s
BM_SendmsgTCP/1/8M     2.111GB/s        2.013GB/s        2.109GB/s
BM_SendmsgTCP/1/16M    1.902GB/s        1.868GB/s        1.897GB/s
BM_SendmsgTCP/1/32M    1.655GB/s        1.665GB/s        1.635GB/s
BM_SendmsgTCP/1/64M    1.575GB/s        1.547GB/s        1.575GB/s
BM_SendmsgTCP/1/128M   1.524GB/s        1.584GB/s        1.580GB/s
BM_SendmsgTCP/1/256M   1.579GB/s        1.607GB/s        1.593GB/s

PiperOrigin-RevId: 278940079
---
 pkg/tcpip/BUILD                                    |  2 +
 pkg/tcpip/link/channel/channel.go                  | 10 ++--
 pkg/tcpip/link/fdbased/endpoint.go                 |  4 +-
 pkg/tcpip/link/fdbased/endpoint_test.go            | 27 ++++-----
 pkg/tcpip/link/fdbased/mmap.go                     |  5 +-
 pkg/tcpip/link/fdbased/packet_dispatchers.go       | 18 ++++--
 pkg/tcpip/link/loopback/loopback.go                | 10 +++-
 pkg/tcpip/link/muxed/injectable.go                 |  4 +-
 pkg/tcpip/link/sharedmem/sharedmem.go              |  7 ++-
 pkg/tcpip/link/sharedmem/sharedmem_test.go         |  9 ++-
 pkg/tcpip/link/sniffer/sniffer.go                  | 12 ++--
 pkg/tcpip/link/waitable/waitable.go                |  4 +-
 pkg/tcpip/link/waitable/waitable_test.go           |  8 +--
 pkg/tcpip/network/arp/arp.go                       |  4 +-
 pkg/tcpip/network/arp/arp_test.go                  |  4 +-
 pkg/tcpip/network/ip_test.go                       | 34 ++++++++----
 pkg/tcpip/network/ipv4/icmp.go                     | 34 +++++++-----
 pkg/tcpip/network/ipv4/ipv4.go                     | 43 +++++++++------
 pkg/tcpip/network/ipv4/ipv4_test.go                |  4 +-
 pkg/tcpip/network/ipv6/icmp.go                     | 48 ++++++++--------
 pkg/tcpip/network/ipv6/icmp_test.go                | 24 +++++---
 pkg/tcpip/network/ipv6/ipv6.go                     | 28 ++++++----
 pkg/tcpip/network/ipv6/ipv6_test.go                |  8 ++-
 pkg/tcpip/network/ipv6/ndp_test.go                 |  8 ++-
 pkg/tcpip/packet_buffer.go                         | 54 ++++++++++++++++++
 pkg/tcpip/packet_buffer_state.go                   | 26 +++++++++
 pkg/tcpip/stack/ndp_test.go                        |  4 +-
 pkg/tcpip/stack/nic.go                             | 48 ++++++++--------
 pkg/tcpip/stack/registration.go                    | 64 +++++++++++++---------
 pkg/tcpip/stack/stack.go                           |  4 +-
 pkg/tcpip/stack/stack_test.go                      | 50 +++++++++++------
 pkg/tcpip/stack/transport_demuxer.go               | 53 +++++++++---------
 pkg/tcpip/stack/transport_demuxer_test.go          |  4 +-
 pkg/tcpip/stack/transport_test.go                  | 34 ++++++++----
 pkg/tcpip/transport/icmp/endpoint.go               | 18 +++---
 pkg/tcpip/transport/icmp/protocol.go               |  2 +-
 pkg/tcpip/transport/packet/endpoint.go             | 19 ++++---
 pkg/tcpip/transport/raw/endpoint.go                | 17 +++---
 pkg/tcpip/transport/tcp/endpoint.go                |  6 +-
 pkg/tcpip/transport/tcp/forwarder.go               |  5 +-
 pkg/tcpip/transport/tcp/protocol.go                |  4 +-
 pkg/tcpip/transport/tcp/segment.go                 |  5 +-
 pkg/tcpip/transport/tcp/testing/context/context.go | 16 ++++--
 pkg/tcpip/transport/udp/endpoint.go                | 20 +++----
 pkg/tcpip/transport/udp/forwarder.go               |  9 ++-
 pkg/tcpip/transport/udp/protocol.go                | 30 +++++-----
 pkg/tcpip/transport/udp/udp_test.go                | 19 +++++--
 test/syscalls/linux/raw_socket_icmp.cc             |  2 +-
 48 files changed, 542 insertions(+), 330 deletions(-)
 create mode 100644 pkg/tcpip/packet_buffer.go
 create mode 100644 pkg/tcpip/packet_buffer_state.go

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
index 3c2b2b5ea..65d4d0cd8 100644
--- a/pkg/tcpip/BUILD
+++ b/pkg/tcpip/BUILD
@@ -6,6 +6,8 @@ package(licenses = ["notice"])
 go_library(
     name = "tcpip",
     srcs = [
+        "packet_buffer.go",
+        "packet_buffer_state.go",
         "tcpip.go",
         "time_unsafe.go",
     ],
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 14f197a77..22eefb564 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -65,14 +65,14 @@ func (e *Endpoint) Drain() int {
 	}
 }
 
-// Inject injects an inbound packet.
-func (e *Endpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
-	e.InjectLinkAddr(protocol, "", vv)
+// InjectInbound injects an inbound packet.
+func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+	e.InjectLinkAddr(protocol, "", pkt)
 }
 
 // InjectLinkAddr injects an inbound packet with a remote link address.
-func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, vv buffer.VectorisedView) {
-	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, vv.Clone(nil), nil /* linkHeader */)
+func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt tcpip.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt)
 }
 
 // Attach saves the stack network-layer dispatcher for use later when packets
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index ae4858529..edef7db26 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -598,8 +598,8 @@ func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
 }
 
 // InjectInbound injects an inbound packet.
-func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */)
+func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, pkt)
 }
 
 // NewInjectable creates a new fd-based InjectableEndpoint.
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index e7c05ca4f..7e08e033b 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -43,10 +43,9 @@ const (
 )
 
 type packetInfo struct {
-	raddr      tcpip.LinkAddress
-	proto      tcpip.NetworkProtocolNumber
-	contents   buffer.VectorisedView
-	linkHeader buffer.View
+	raddr    tcpip.LinkAddress
+	proto    tcpip.NetworkProtocolNumber
+	contents tcpip.PacketBuffer
 }
 
 type context struct {
@@ -93,8 +92,8 @@ func (c *context) cleanup() {
 	syscall.Close(c.fds[1])
 }
 
-func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
-	c.ch <- packetInfo{remote, protocol, vv, linkHeader}
+func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+	c.ch <- packetInfo{remote, protocol, pkt}
 }
 
 func TestNoEthernetProperties(t *testing.T) {
@@ -317,19 +316,21 @@ func TestDeliverPacket(t *testing.T) {
 				select {
 				case pi := <-c.ch:
 					want := packetInfo{
-						raddr:      raddr,
-						proto:      proto,
-						contents:   buffer.View(b).ToVectorisedView(),
-						linkHeader: buffer.View(hdr),
+						raddr: raddr,
+						proto: proto,
+						contents: tcpip.PacketBuffer{
+							Data:       buffer.View(b).ToVectorisedView(),
+							LinkHeader: buffer.View(hdr),
+						},
 					}
 					if !eth {
 						want.proto = header.IPv4ProtocolNumber
 						want.raddr = ""
 					}
-					// want.contents will be a single view,
-					// so make pi do the same for the
+					// want.contents.Data will be a single
+					// view, so make pi do the same for the
 					// DeepEqual check.
-					pi.contents = pi.contents.ToView().ToVectorisedView()
+					pi.contents.Data = pi.contents.Data.ToView().ToVectorisedView()
 					if !reflect.DeepEqual(want, pi) {
 						t.Fatalf("Unexpected received packet: %+v, want %+v", pi, want)
 					}
diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go
index 554d45715..62ed1e569 100644
--- a/pkg/tcpip/link/fdbased/mmap.go
+++ b/pkg/tcpip/link/fdbased/mmap.go
@@ -190,6 +190,9 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	pkt = pkt[d.e.hdrSize:]
-	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}), buffer.View(eth))
+	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, tcpip.PacketBuffer{
+		Data:       buffer.View(pkt).ToVectorisedView(),
+		LinkHeader: buffer.View(eth),
+	})
 	return true, nil
 }
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go
index 3331b6453..c67d684ce 100644
--- a/pkg/tcpip/link/fdbased/packet_dispatchers.go
+++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go
@@ -139,10 +139,13 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	used := d.capViews(n, BufConfig)
-	vv := buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...))
-	vv.TrimFront(d.e.hdrSize)
+	pkt := tcpip.PacketBuffer{
+		Data:       buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...)),
+		LinkHeader: buffer.View(eth),
+	}
+	pkt.Data.TrimFront(d.e.hdrSize)
 
-	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv, buffer.View(eth))
+	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, pkt)
 
 	// Prepare e.views for another packet: release used views.
 	for i := 0; i < used; i++ {
@@ -293,9 +296,12 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
 		}
 
 		used := d.capViews(k, int(n), BufConfig)
-		vv := buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...))
-		vv.TrimFront(d.e.hdrSize)
-		d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv, buffer.View(eth))
+		pkt := tcpip.PacketBuffer{
+			Data:       buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...)),
+			LinkHeader: buffer.View(eth),
+		}
+		pkt.Data.TrimFront(d.e.hdrSize)
+		d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, pkt)
 
 		// Prepare e.views for another packet: release used views.
 		for i := 0; i < used; i++ {
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index a3b48fa73..bc5d8a2f3 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -80,12 +80,13 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prependa
 	views := make([]buffer.View, 1, 1+len(payload.Views()))
 	views[0] = hdr.View()
 	views = append(views, payload.Views()...)
-	vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views)
 
 	// Because we're immediately turning around and writing the packet back to the
 	// rx path, we intentionally don't preserve the remote and local link
 	// addresses from the stack.Route we're passed.
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */)
+	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, tcpip.PacketBuffer{
+		Data: buffer.NewVectorisedView(len(views[0])+payload.Size(), views),
+	})
 
 	return nil
 }
@@ -105,7 +106,10 @@ func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
 	// There should be an ethernet header at the beginning of packet.
 	linkHeader := header.Ethernet(packet.First()[:header.EthernetMinimumSize])
 	packet.TrimFront(len(linkHeader))
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), packet, buffer.View(linkHeader))
+	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), tcpip.PacketBuffer{
+		Data:       packet,
+		LinkHeader: buffer.View(linkHeader),
+	})
 
 	return nil
 }
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
index 682b60291..9a8e8ebfe 100644
--- a/pkg/tcpip/link/muxed/injectable.go
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -80,8 +80,8 @@ func (m *InjectableEndpoint) IsAttached() bool {
 }
 
 // InjectInbound implements stack.InjectableLinkEndpoint.
-func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
-	m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */)
+func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
+	m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, pkt)
 }
 
 // WritePackets writes outbound packets to the appropriate
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 279e2b457..2bace5298 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -273,8 +273,11 @@ func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
 		}
 
 		// Send packet up the stack.
-		eth := header.Ethernet(b)
-		d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(), buffer.View(eth))
+		eth := header.Ethernet(b[:header.EthernetMinimumSize])
+		d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), tcpip.PacketBuffer{
+			Data:       buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(),
+			LinkHeader: buffer.View(eth),
+		})
 	}
 
 	// Clean state.
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index f3e9705c9..199406886 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -131,13 +131,12 @@ func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress
 	return c
 }
 
-func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
+func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	c.mu.Lock()
 	c.packets = append(c.packets, packetInfo{
-		addr:       remoteLinkAddr,
-		proto:      proto,
-		vv:         vv.Clone(nil),
-		linkHeader: linkHeader,
+		addr:  remoteLinkAddr,
+		proto: proto,
+		vv:    pkt.Data.Clone(nil),
 	})
 	c.mu.Unlock()
 
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 39757ea2a..d71a03cd2 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -116,19 +116,19 @@ func NewWithFile(lower stack.LinkEndpoint, file *os.File, snapLen uint32) (stack
 // DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is
 // called by the link-layer endpoint being wrapped when a packet arrives, and
 // logs the packet before forwarding to the actual dispatcher.
-func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
+func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
-		logPacket("recv", protocol, vv.First(), nil)
+		logPacket("recv", protocol, pkt.Data.First(), nil)
 	}
 	if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
-		vs := vv.Views()
-		length := vv.Size()
+		vs := pkt.Data.Views()
+		length := pkt.Data.Size()
 		if length > int(e.maxPCAPLen) {
 			length = int(e.maxPCAPLen)
 		}
 
 		buf := bytes.NewBuffer(make([]byte, 0, pcapPacketHeaderLen+length))
-		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(vv.Size()))); err != nil {
+		if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(pkt.Data.Size()))); err != nil {
 			panic(err)
 		}
 		for _, v := range vs {
@@ -147,7 +147,7 @@ func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local
 			panic(err)
 		}
 	}
-	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv, linkHeader)
+	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt)
 }
 
 // Attach implements the stack.LinkEndpoint interface. It saves the dispatcher
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
index a04fc1062..b440970e0 100644
--- a/pkg/tcpip/link/waitable/waitable.go
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -50,12 +50,12 @@ func New(lower stack.LinkEndpoint) *Endpoint {
 // It is called by the link-layer endpoint being wrapped when a packet arrives,
 // and only forwards to the actual dispatcher if Wait or WaitDispatch haven't
 // been called.
-func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
+func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	if !e.dispatchGate.Enter() {
 		return
 	}
 
-	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv, linkHeader)
+	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt)
 	e.dispatchGate.Leave()
 }
 
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
index 5f0f8fa2d..df2e70e54 100644
--- a/pkg/tcpip/link/waitable/waitable_test.go
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -35,7 +35,7 @@ type countedEndpoint struct {
 	dispatcher stack.NetworkDispatcher
 }
 
-func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
+func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	e.dispatchCount++
 }
 
@@ -120,21 +120,21 @@ func TestWaitDispatch(t *testing.T) {
 	}
 
 	// Dispatch and check that it goes through.
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
 	if want := 1; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on writes, then try to dispatch. It must go through.
 	wep.WaitWrite()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on dispatches, then try to dispatch. It must not go through.
 	wep.WaitDispatch()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{})
+	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, tcpip.PacketBuffer{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 46178459e..4161ebf87 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -92,8 +92,8 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.Vect
 	return tcpip.ErrNotSupported
 }
 
-func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
-	v := vv.First()
+func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+	v := pkt.Data.First()
 	h := header.ARP(v)
 	if !h.IsValid() {
 		return
diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
index 88b57ec03..47098bfdc 100644
--- a/pkg/tcpip/network/arp/arp_test.go
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -102,7 +102,9 @@ func TestDirectRequest(t *testing.T) {
 
 	inject := func(addr tcpip.Address) {
 		copy(h.ProtocolAddressTarget(), addr)
-		c.linkEP.Inject(arp.ProtocolNumber, v.ToVectorisedView())
+		c.linkEP.InjectInbound(arp.ProtocolNumber, tcpip.PacketBuffer{
+			Data: v.ToVectorisedView(),
+		})
 	}
 
 	for i, address := range []tcpip.Address{stackAddr1, stackAddr2} {
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index 666d8b92a..fe499d47e 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -96,16 +96,16 @@ func (t *testObject) checkValues(protocol tcpip.TransportProtocolNumber, vv buff
 // DeliverTransportPacket is called by network endpoints after parsing incoming
 // packets. This is used by the test object to verify that the results of the
 // parsing are expected.
-func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView) {
-	t.checkValues(protocol, vv, r.RemoteAddress, r.LocalAddress)
+func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) {
+	t.checkValues(protocol, pkt.Data, r.RemoteAddress, r.LocalAddress)
 	t.dataCalls++
 }
 
 // DeliverTransportControlPacket is called by network endpoints after parsing
 // incoming control (ICMP) packets. This is used by the test object to verify
 // that the results of the parsing are expected.
-func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
-	t.checkValues(trans, vv, remote, local)
+func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+	t.checkValues(trans, pkt.Data, remote, local)
 	if typ != t.typ {
 		t.t.Errorf("typ = %v, want %v", typ, t.typ)
 	}
@@ -279,7 +279,9 @@ func TestIPv4Receive(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	ep.HandlePacket(&r, view.ToVectorisedView())
+	ep.HandlePacket(&r, tcpip.PacketBuffer{
+		Data: view.ToVectorisedView(),
+	})
 	if o.dataCalls != 1 {
 		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
 	}
@@ -367,7 +369,9 @@ func TestIPv4ReceiveControl(t *testing.T) {
 			o.extra = c.expectedExtra
 
 			vv := view[:len(view)-c.trunc].ToVectorisedView()
-			ep.HandlePacket(&r, vv)
+			ep.HandlePacket(&r, tcpip.PacketBuffer{
+				Data: vv,
+			})
 			if want := c.expectedCount; o.controlCalls != want {
 				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want)
 			}
@@ -430,13 +434,17 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 	}
 
 	// Send first segment.
-	ep.HandlePacket(&r, frag1.ToVectorisedView())
+	ep.HandlePacket(&r, tcpip.PacketBuffer{
+		Data: frag1.ToVectorisedView(),
+	})
 	if o.dataCalls != 0 {
 		t.Fatalf("Bad number of data calls: got %x, want 0", o.dataCalls)
 	}
 
 	// Send second segment.
-	ep.HandlePacket(&r, frag2.ToVectorisedView())
+	ep.HandlePacket(&r, tcpip.PacketBuffer{
+		Data: frag2.ToVectorisedView(),
+	})
 	if o.dataCalls != 1 {
 		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
 	}
@@ -509,7 +517,9 @@ func TestIPv6Receive(t *testing.T) {
 		t.Fatalf("could not find route: %v", err)
 	}
 
-	ep.HandlePacket(&r, view.ToVectorisedView())
+	ep.HandlePacket(&r, tcpip.PacketBuffer{
+		Data: view.ToVectorisedView(),
+	})
 	if o.dataCalls != 1 {
 		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
 	}
@@ -618,12 +628,12 @@ func TestIPv6ReceiveControl(t *testing.T) {
 			o.typ = c.expectedTyp
 			o.extra = c.expectedExtra
 
-			vv := view[:len(view)-c.trunc].ToVectorisedView()
-
 			// Set ICMPv6 checksum.
 			icmp.SetChecksum(header.ICMPv6Checksum(icmp, outerSrcAddr, localIpv6Addr, buffer.VectorisedView{}))
 
-			ep.HandlePacket(&r, vv)
+			ep.HandlePacket(&r, tcpip.PacketBuffer{
+				Data: view[:len(view)-c.trunc].ToVectorisedView(),
+			})
 			if want := c.expectedCount; o.controlCalls != want {
 				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want)
 			}
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
index 50b363dc4..ce771631c 100644
--- a/pkg/tcpip/network/ipv4/icmp.go
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -15,6 +15,7 @@
 package ipv4
 
 import (
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -24,8 +25,8 @@ import (
 // the original packet that caused the ICMP one to be sent. This information is
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
-func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
-	h := header.IPv4(vv.First())
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+	h := header.IPv4(pkt.Data.First())
 
 	// We don't use IsValid() here because ICMP only requires that the IP
 	// header plus 8 bytes of the transport header be included. So it's
@@ -39,7 +40,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.
 	}
 
 	hlen := int(h.HeaderLength())
-	if vv.Size() < hlen || h.FragmentOffset() != 0 {
+	if pkt.Data.Size() < hlen || h.FragmentOffset() != 0 {
 		// We won't be able to handle this if it doesn't contain the
 		// full IPv4 header, or if it's a fragment not at offset 0
 		// (because it won't have the transport header).
@@ -47,15 +48,15 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.
 	}
 
 	// Skip the ip header, then deliver control message.
-	vv.TrimFront(hlen)
+	pkt.Data.TrimFront(hlen)
 	p := h.TransportProtocol()
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, vv)
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
-func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.VectorisedView) {
+func (e *endpoint) handleICMP(r *stack.Route, pkt tcpip.PacketBuffer) {
 	stats := r.Stats()
 	received := stats.ICMP.V4PacketsReceived
-	v := vv.First()
+	v := pkt.Data.First()
 	if len(v) < header.ICMPv4MinimumSize {
 		received.Invalid.Increment()
 		return
@@ -73,20 +74,23 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 		// checksum. We'll have to reset this before we hand the packet
 		// off.
 		h.SetChecksum(0)
-		gotChecksum := ^header.ChecksumVV(vv, 0 /* initial */)
+		gotChecksum := ^header.ChecksumVV(pkt.Data, 0 /* initial */)
 		if gotChecksum != wantChecksum {
 			// It's possible that a raw socket expects to receive this.
 			h.SetChecksum(wantChecksum)
-			e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, netHeader, vv)
+			e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, pkt)
 			received.Invalid.Increment()
 			return
 		}
 
 		// It's possible that a raw socket expects to receive this.
 		h.SetChecksum(wantChecksum)
-		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, netHeader, vv)
+		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, tcpip.PacketBuffer{
+			Data:          pkt.Data.Clone(nil),
+			NetworkHeader: append(buffer.View(nil), pkt.NetworkHeader...),
+		})
 
-		vv := vv.Clone(nil)
+		vv := pkt.Data.Clone(nil)
 		vv.TrimFront(header.ICMPv4MinimumSize)
 		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize)
 		pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize))
@@ -104,19 +108,19 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 	case header.ICMPv4EchoReply:
 		received.EchoReply.Increment()
 
-		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, netHeader, vv)
+		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, pkt)
 
 	case header.ICMPv4DstUnreachable:
 		received.DstUnreachable.Increment()
 
-		vv.TrimFront(header.ICMPv4MinimumSize)
+		pkt.Data.TrimFront(header.ICMPv4MinimumSize)
 		switch h.Code() {
 		case header.ICMPv4PortUnreachable:
-			e.handleControl(stack.ControlPortUnreachable, 0, vv)
+			e.handleControl(stack.ControlPortUnreachable, 0, pkt)
 
 		case header.ICMPv4FragmentationNeeded:
 			mtu := uint32(h.MTU())
-			e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), vv)
+			e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), pkt)
 		}
 
 	case header.ICMPv4SrcQuench:
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 1339f8474..26f1402ed 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -198,7 +198,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, hdr buff
 	return nil
 }
 
-func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) {
+func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) header.IPv4 {
 	ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
 	length := uint16(hdr.UsedLength() + payloadSize)
 	id := uint32(0)
@@ -218,19 +218,24 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 		DstAddr:     r.RemoteAddress,
 	})
 	ip.SetChecksum(^ip.CalculateChecksum())
+	return ip
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
-	e.addIPHeader(r, &hdr, payload.Size(), params)
+	ip := e.addIPHeader(r, &hdr, payload.Size(), params)
 
 	if loop&stack.PacketLoop != 0 {
 		views := make([]buffer.View, 1, 1+len(payload.Views()))
 		views[0] = hdr.View()
 		views = append(views, payload.Views()...)
-		vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views)
 		loopedR := r.MakeLoopedRoute()
-		e.HandlePacket(&loopedR, vv)
+
+		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
+			Data:          buffer.NewVectorisedView(len(views[0])+payload.Size(), views),
+			NetworkHeader: buffer.View(ip),
+		})
+
 		loopedR.Release()
 	}
 	if loop&stack.PacketOut == 0 {
@@ -301,7 +306,10 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.Vect
 	ip.SetChecksum(^ip.CalculateChecksum())
 
 	if loop&stack.PacketLoop != 0 {
-		e.HandlePacket(r, payload)
+		e.HandlePacket(r, tcpip.PacketBuffer{
+			Data:          payload,
+			NetworkHeader: buffer.View(ip),
+		})
 	}
 	if loop&stack.PacketOut == 0 {
 		return nil
@@ -314,22 +322,23 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.Vect
 
 // HandlePacket is called by the link layer when new ipv4 packets arrive for
 // this endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
-	headerView := vv.First()
+func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+	headerView := pkt.Data.First()
 	h := header.IPv4(headerView)
-	if !h.IsValid(vv.Size()) {
+	if !h.IsValid(pkt.Data.Size()) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
 		return
 	}
+	pkt.NetworkHeader = headerView[:h.HeaderLength()]
 
 	hlen := int(h.HeaderLength())
 	tlen := int(h.TotalLength())
-	vv.TrimFront(hlen)
-	vv.CapLength(tlen - hlen)
+	pkt.Data.TrimFront(hlen)
+	pkt.Data.CapLength(tlen - hlen)
 
 	more := (h.Flags() & header.IPv4FlagMoreFragments) != 0
 	if more || h.FragmentOffset() != 0 {
-		if vv.Size() == 0 {
+		if pkt.Data.Size() == 0 {
 			// Drop the packet as it's marked as a fragment but has
 			// no payload.
 			r.Stats().IP.MalformedPacketsReceived.Increment()
@@ -337,10 +346,10 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
 			return
 		}
 		// The packet is a fragment, let's try to reassemble it.
-		last := h.FragmentOffset() + uint16(vv.Size()) - 1
+		last := h.FragmentOffset() + uint16(pkt.Data.Size()) - 1
 		// Drop the packet if the fragmentOffset is incorrect. i.e the
-		// combination of fragmentOffset and vv.size() causes a wrap
-		// around resulting in last being less than the offset.
+		// combination of fragmentOffset and pkt.Data.Size() causes a
+		// wrap around resulting in last being less than the offset.
 		if last < h.FragmentOffset() {
 			r.Stats().IP.MalformedPacketsReceived.Increment()
 			r.Stats().IP.MalformedFragmentsReceived.Increment()
@@ -348,7 +357,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
 		}
 		var ready bool
 		var err error
-		vv, ready, err = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, vv)
+		pkt.Data, ready, err = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, pkt.Data)
 		if err != nil {
 			r.Stats().IP.MalformedPacketsReceived.Increment()
 			r.Stats().IP.MalformedFragmentsReceived.Increment()
@@ -361,11 +370,11 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
 	p := h.TransportProtocol()
 	if p == header.ICMPv4ProtocolNumber {
 		headerView.CapLength(hlen)
-		e.handleICMP(r, headerView, vv)
+		e.handleICMP(r, pkt)
 		return
 	}
 	r.Stats().IP.PacketsDelivered.Increment()
-	e.dispatcher.DeliverTransportPacket(r, p, headerView, vv)
+	e.dispatcher.DeliverTransportPacket(r, p, pkt)
 }
 
 // Close cleans up resources associated with the endpoint.
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index 99f84acd7..f100d84ee 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -464,7 +464,9 @@ func TestInvalidFragments(t *testing.T) {
 			s.CreateNIC(nicid, sniffer.New(ep))
 
 			for _, pkt := range tc.packets {
-				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, buffer.NewVectorisedView(len(pkt), []buffer.View{pkt}))
+				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, tcpip.PacketBuffer{
+					Data: buffer.NewVectorisedView(len(pkt), []buffer.View{pkt}),
+				})
 			}
 
 			if got, want := s.Stats().IP.MalformedPacketsReceived.Value(), tc.wantMalformedIPPackets; got != want {
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index 05e8c075b..58f8e80df 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -25,8 +25,8 @@ import (
 // the original packet that caused the ICMP one to be sent. This information is
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
-func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
-	h := header.IPv6(vv.First())
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
+	h := header.IPv6(pkt.Data.First())
 
 	// We don't use IsValid() here because ICMP only requires that up to
 	// 1280 bytes of the original packet be included. So it's likely that it
@@ -40,10 +40,10 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.
 
 	// Skip the IP header, then handle the fragmentation header if there
 	// is one.
-	vv.TrimFront(header.IPv6MinimumSize)
+	pkt.Data.TrimFront(header.IPv6MinimumSize)
 	p := h.TransportProtocol()
 	if p == header.IPv6FragmentHeader {
-		f := header.IPv6Fragment(vv.First())
+		f := header.IPv6Fragment(pkt.Data.First())
 		if !f.IsValid() || f.FragmentOffset() != 0 {
 			// We can't handle fragments that aren't at offset 0
 			// because they don't have the transport headers.
@@ -52,19 +52,19 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.
 
 		// Skip fragmentation header and find out the actual protocol
 		// number.
-		vv.TrimFront(header.IPv6FragmentHeaderSize)
+		pkt.Data.TrimFront(header.IPv6FragmentHeaderSize)
 		p = f.TransportProtocol()
 	}
 
 	// Deliver the control packet to the transport endpoint.
-	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, vv)
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
-func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.VectorisedView) {
+func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt tcpip.PacketBuffer) {
 	stats := r.Stats().ICMP
 	sent := stats.V6PacketsSent
 	received := stats.V6PacketsReceived
-	v := vv.First()
+	v := pkt.Data.First()
 	if len(v) < header.ICMPv6MinimumSize {
 		received.Invalid.Increment()
 		return
@@ -77,7 +77,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 	// Only the first view in vv is accounted for by h. To account for the
 	// rest of vv, a shallow copy is made and the first view is removed.
 	// This copy is used as extra payload during the checksum calculation.
-	payload := vv
+	payload := pkt.Data
 	payload.RemoveFirst()
 	if got, want := h.Checksum(), header.ICMPv6Checksum(h, iph.SourceAddress(), iph.DestinationAddress(), payload); got != want {
 		received.Invalid.Increment()
@@ -113,9 +113,9 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			received.Invalid.Increment()
 			return
 		}
-		vv.TrimFront(header.ICMPv6PacketTooBigMinimumSize)
+		pkt.Data.TrimFront(header.ICMPv6PacketTooBigMinimumSize)
 		mtu := h.MTU()
-		e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), vv)
+		e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), pkt)
 
 	case header.ICMPv6DstUnreachable:
 		received.DstUnreachable.Increment()
@@ -123,10 +123,10 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			received.Invalid.Increment()
 			return
 		}
-		vv.TrimFront(header.ICMPv6DstUnreachableMinimumSize)
+		pkt.Data.TrimFront(header.ICMPv6DstUnreachableMinimumSize)
 		switch h.Code() {
 		case header.ICMPv6PortUnreachable:
-			e.handleControl(stack.ControlPortUnreachable, 0, vv)
+			e.handleControl(stack.ControlPortUnreachable, 0, pkt)
 		}
 
 	case header.ICMPv6NeighborSolicit:
@@ -189,9 +189,9 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress[:]),
 		}
 		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length()))
-		pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
-		pkt.SetType(header.ICMPv6NeighborAdvert)
-		na := header.NDPNeighborAdvert(pkt.NDPPayload())
+		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
+		packet.SetType(header.ICMPv6NeighborAdvert)
+		na := header.NDPNeighborAdvert(packet.NDPPayload())
 		na.SetSolicitedFlag(true)
 		na.SetOverrideFlag(true)
 		na.SetTargetAddress(targetAddr)
@@ -209,7 +209,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 		r := r.Clone()
 		defer r.Release()
 		r.LocalAddress = targetAddr
-		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
 
 		// TODO(tamird/ghanan): there exists an explicit NDP option that is
 		// used to update the neighbor table with link addresses for a
@@ -285,13 +285,13 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			received.Invalid.Increment()
 			return
 		}
-		vv.TrimFront(header.ICMPv6EchoMinimumSize)
+		pkt.Data.TrimFront(header.ICMPv6EchoMinimumSize)
 		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize)
-		pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize))
-		copy(pkt, h)
-		pkt.SetType(header.ICMPv6EchoReply)
-		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, vv))
-		if err := r.WritePacket(nil /* gso */, hdr, vv, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}); err != nil {
+		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize))
+		copy(packet, h)
+		packet.SetType(header.ICMPv6EchoReply)
+		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data))
+		if err := r.WritePacket(nil /* gso */, hdr, pkt.Data, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}); err != nil {
 			sent.Dropped.Increment()
 			return
 		}
@@ -303,7 +303,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
 			received.Invalid.Increment()
 			return
 		}
-		e.dispatcher.DeliverTransportPacket(r, header.ICMPv6ProtocolNumber, netHeader, vv)
+		e.dispatcher.DeliverTransportPacket(r, header.ICMPv6ProtocolNumber, pkt)
 
 	case header.ICMPv6TimeExceeded:
 		received.TimeExceeded.Increment()
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index d686f79ce..6037a1ef8 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -65,7 +65,7 @@ type stubDispatcher struct {
 	stack.TransportDispatcher
 }
 
-func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, buffer.View, buffer.VectorisedView) {
+func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, tcpip.PacketBuffer) {
 }
 
 type stubLinkAddressCache struct {
@@ -147,7 +147,9 @@ func TestICMPCounts(t *testing.T) {
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
 		})
-		ep.HandlePacket(&r, hdr.View().ToVectorisedView())
+		ep.HandlePacket(&r, tcpip.PacketBuffer{
+			Data: hdr.View().ToVectorisedView(),
+		})
 	}
 
 	for _, typ := range types {
@@ -280,7 +282,9 @@ func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.
 		views := []buffer.View{pkt.Header, pkt.Payload}
 		size := len(pkt.Header) + len(pkt.Payload)
 		vv := buffer.NewVectorisedView(size, views)
-		args.dst.InjectLinkAddr(pkt.Proto, args.dst.LinkAddress(), vv)
+		args.dst.InjectLinkAddr(pkt.Proto, args.dst.LinkAddress(), tcpip.PacketBuffer{
+			Data: vv,
+		})
 	}
 
 	if pkt.Proto != ProtocolNumber {
@@ -498,7 +502,9 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.Inject(ProtocolNumber, hdr.View().ToVectorisedView())
+				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+					Data: hdr.View().ToVectorisedView(),
+				})
 			}
 
 			stats := s.Stats().ICMP.V6PacketsReceived
@@ -673,7 +679,9 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.Inject(ProtocolNumber, hdr.View().ToVectorisedView())
+				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+					Data: hdr.View().ToVectorisedView(),
+				})
 			}
 
 			stats := s.Stats().ICMP.V6PacketsReceived
@@ -849,9 +857,9 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.Inject(ProtocolNumber,
-					buffer.NewVectorisedView(header.IPv6MinimumSize+size+payloadSize,
-						[]buffer.View{hdr.View(), payload}))
+				e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+					Data: buffer.NewVectorisedView(header.IPv6MinimumSize+size+payloadSize, []buffer.View{hdr.View(), payload}),
+				})
 			}
 
 			stats := s.Stats().ICMP.V6PacketsReceived
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 5898f8f9e..805d1739c 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -97,7 +97,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
 	return 0
 }
 
-func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) {
+func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) header.IPv6 {
 	length := uint16(hdr.UsedLength() + payloadSize)
 	ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
 	ip.Encode(&header.IPv6Fields{
@@ -108,19 +108,24 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 		SrcAddr:       r.LocalAddress,
 		DstAddr:       r.RemoteAddress,
 	})
+	return ip
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
-	e.addIPHeader(r, &hdr, payload.Size(), params)
+	ip := e.addIPHeader(r, &hdr, payload.Size(), params)
 
 	if loop&stack.PacketLoop != 0 {
 		views := make([]buffer.View, 1, 1+len(payload.Views()))
 		views[0] = hdr.View()
 		views = append(views, payload.Views()...)
-		vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views)
 		loopedR := r.MakeLoopedRoute()
-		e.HandlePacket(&loopedR, vv)
+
+		e.HandlePacket(&loopedR, tcpip.PacketBuffer{
+			Data:          buffer.NewVectorisedView(len(views[0])+payload.Size(), views),
+			NetworkHeader: buffer.View(ip),
+		})
+
 		loopedR.Release()
 	}
 	if loop&stack.PacketOut == 0 {
@@ -160,24 +165,25 @@ func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.Vector
 
 // HandlePacket is called by the link layer when new ipv6 packets arrive for
 // this endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
-	headerView := vv.First()
+func (e *endpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
+	headerView := pkt.Data.First()
 	h := header.IPv6(headerView)
-	if !h.IsValid(vv.Size()) {
+	if !h.IsValid(pkt.Data.Size()) {
 		return
 	}
 
-	vv.TrimFront(header.IPv6MinimumSize)
-	vv.CapLength(int(h.PayloadLength()))
+	pkt.NetworkHeader = headerView[:header.IPv6MinimumSize]
+	pkt.Data.TrimFront(header.IPv6MinimumSize)
+	pkt.Data.CapLength(int(h.PayloadLength()))
 
 	p := h.TransportProtocol()
 	if p == header.ICMPv6ProtocolNumber {
-		e.handleICMP(r, headerView, vv)
+		e.handleICMP(r, headerView, pkt)
 		return
 	}
 
 	r.Stats().IP.PacketsDelivered.Increment()
-	e.dispatcher.DeliverTransportPacket(r, p, headerView, vv)
+	e.dispatcher.DeliverTransportPacket(r, p, pkt)
 }
 
 // Close cleans up resources associated with the endpoint.
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index deaa9b7f3..1cbfa7278 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -55,7 +55,9 @@ func testReceiveICMP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 		DstAddr:       dst,
 	})
 
-	e.Inject(ProtocolNumber, hdr.View().ToVectorisedView())
+	e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+		Data: hdr.View().ToVectorisedView(),
+	})
 
 	stats := s.Stats().ICMP.V6PacketsReceived
 
@@ -111,7 +113,9 @@ func testReceiveUDP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 		DstAddr:       dst,
 	})
 
-	e.Inject(ProtocolNumber, hdr.View().ToVectorisedView())
+	e.InjectInbound(ProtocolNumber, tcpip.PacketBuffer{
+		Data: hdr.View().ToVectorisedView(),
+	})
 
 	stat := s.Stats().UDP.PacketsReceived
 
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index 69ab7ba12..0dbce14a0 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -98,7 +98,9 @@ func TestHopLimitValidation(t *testing.T) {
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
 		})
-		ep.HandlePacket(r, hdr.View().ToVectorisedView())
+		ep.HandlePacket(r, tcpip.PacketBuffer{
+			Data: hdr.View().ToVectorisedView(),
+		})
 	}
 
 	types := []struct {
@@ -345,7 +347,9 @@ func TestRouterAdvertValidation(t *testing.T) {
 				t.Fatalf("got rxRA = %d, want = 0", got)
 			}
 
-			e.Inject(header.IPv6ProtocolNumber, hdr.View().ToVectorisedView())
+			e.InjectInbound(header.IPv6ProtocolNumber, tcpip.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
 
 			if test.expectedSuccess {
 				if got := invalid.Value(); got != 0 {
diff --git a/pkg/tcpip/packet_buffer.go b/pkg/tcpip/packet_buffer.go
new file mode 100644
index 000000000..10b04239d
--- /dev/null
+++ b/pkg/tcpip/packet_buffer.go
@@ -0,0 +1,54 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcpip
+
+import "gvisor.dev/gvisor/pkg/tcpip/buffer"
+
+// A PacketBuffer contains all the data of a network packet.
+//
+// As a PacketBuffer traverses up the stack, it may be necessary to pass it to
+// multiple endpoints. Clone() should be called in such cases so that
+// modifications to the Data field do not affect other copies.
+//
+// +stateify savable
+type PacketBuffer struct {
+	// Data holds the payload of the packet. For inbound packets, it also
+	// holds the headers, which are consumed as the packet moves up the
+	// stack. Headers are guaranteed not to be split across views.
+	//
+	// The bytes backing Data are immutable, but Data itself may be trimmed
+	// or otherwise modified.
+	Data buffer.VectorisedView
+
+	// The bytes backing these views are immutable. Each field may be nil
+	// if either it has not been set yet or no such header exists (e.g.
+	// packets sent via loopback may not have a link header).
+	//
+	// These fields may be Views into other Views. SR doesn't support this,
+	// so deep copies are necessary in some cases.
+	LinkHeader      buffer.View
+	NetworkHeader   buffer.View
+	TransportHeader buffer.View
+}
+
+// Clone makes a copy of pk. It clones the Data field, which creates a new
+// VectorisedView but does not deep copy the underlying bytes.
+func (pk PacketBuffer) Clone() PacketBuffer {
+	return PacketBuffer{
+		Data:            pk.Data.Clone(nil),
+		LinkHeader:      pk.LinkHeader,
+		NetworkHeader:   pk.NetworkHeader,
+		TransportHeader: pk.TransportHeader,
+	}
+}
diff --git a/pkg/tcpip/packet_buffer_state.go b/pkg/tcpip/packet_buffer_state.go
new file mode 100644
index 000000000..04c4cf136
--- /dev/null
+++ b/pkg/tcpip/packet_buffer_state.go
@@ -0,0 +1,26 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcpip
+
+import "gvisor.dev/gvisor/pkg/tcpip/buffer"
+
+// beforeSave is invoked by stateify.
+func (pk *PacketBuffer) beforeSave() {
+	// Non-Data fields may be slices of the Data field. This causes
+	// problems for SR, so during save we make each header independent.
+	pk.LinkHeader = append(buffer.View(nil), pk.LinkHeader...)
+	pk.NetworkHeader = append(buffer.View(nil), pk.NetworkHeader...)
+	pk.TransportHeader = append(buffer.View(nil), pk.TransportHeader...)
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 525a25218..cc789b5af 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -328,7 +328,9 @@ func TestDADFail(t *testing.T) {
 			// Receive a packet to simulate multiple nodes owning or
 			// attempting to own the same address.
 			hdr := test.makeBuf(addr1)
-			e.Inject(header.IPv6ProtocolNumber, hdr.View().ToVectorisedView())
+			e.InjectInbound(header.IPv6ProtocolNumber, tcpip.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
 
 			stat := test.getStat(s.Stats().ICMP.V6PacketsReceived)
 			if got := stat.Value(); got != 1 {
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 12969c74e..28a28ae6e 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -723,10 +723,10 @@ func (n *NIC) leaveGroupLocked(addr tcpip.Address) *tcpip.Error {
 	return nil
 }
 
-func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, vv buffer.VectorisedView) {
+func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt tcpip.PacketBuffer) {
 	r := makeRoute(protocol, dst, src, localLinkAddr, ref, false /* handleLocal */, false /* multicastLoop */)
 	r.RemoteLinkAddress = remotelinkAddr
-	ref.ep.HandlePacket(&r, vv)
+	ref.ep.HandlePacket(&r, pkt)
 	ref.decRef()
 }
 
@@ -736,9 +736,9 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address,
 // Note that the ownership of the slice backing vv is retained by the caller.
 // This rule applies only to the slice itself, not to the items of the slice;
 // the ownership of the items is not retained by the caller.
-func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
+func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	n.stats.Rx.Packets.Increment()
-	n.stats.Rx.Bytes.IncrementBy(uint64(vv.Size()))
+	n.stats.Rx.Bytes.IncrementBy(uint64(pkt.Data.Size()))
 
 	netProto, ok := n.stack.networkProtocols[protocol]
 	if !ok {
@@ -763,22 +763,22 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	}
 	n.mu.RUnlock()
 	for _, ep := range packetEPs {
-		ep.HandlePacket(n.id, local, protocol, vv.Clone(nil), linkHeader)
+		ep.HandlePacket(n.id, local, protocol, pkt.Clone())
 	}
 
 	if netProto.Number() == header.IPv4ProtocolNumber || netProto.Number() == header.IPv6ProtocolNumber {
 		n.stack.stats.IP.PacketsReceived.Increment()
 	}
 
-	if len(vv.First()) < netProto.MinimumPacketSize() {
+	if len(pkt.Data.First()) < netProto.MinimumPacketSize() {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
 
-	src, dst := netProto.ParseAddresses(vv.First())
+	src, dst := netProto.ParseAddresses(pkt.Data.First())
 
 	if ref := n.getRef(protocol, dst); ref != nil {
-		handlePacket(protocol, dst, src, linkEP.LinkAddress(), remote, ref, vv)
+		handlePacket(protocol, dst, src, linkEP.LinkAddress(), remote, ref, pkt)
 		return
 	}
 
@@ -806,20 +806,20 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 		if ok {
 			r.RemoteAddress = src
 			// TODO(b/123449044): Update the source NIC as well.
-			ref.ep.HandlePacket(&r, vv)
+			ref.ep.HandlePacket(&r, pkt)
 			ref.decRef()
 		} else {
 			// n doesn't have a destination endpoint.
 			// Send the packet out of n.
-			hdr := buffer.NewPrependableFromView(vv.First())
-			vv.RemoveFirst()
+			hdr := buffer.NewPrependableFromView(pkt.Data.First())
+			pkt.Data.RemoveFirst()
 
 			// TODO(b/128629022): use route.WritePacket.
-			if err := n.linkEP.WritePacket(&r, nil /* gso */, hdr, vv, protocol); err != nil {
+			if err := n.linkEP.WritePacket(&r, nil /* gso */, hdr, pkt.Data, protocol); err != nil {
 				r.Stats().IP.OutgoingPacketErrors.Increment()
 			} else {
 				n.stats.Tx.Packets.Increment()
-				n.stats.Tx.Bytes.IncrementBy(uint64(hdr.UsedLength() + vv.Size()))
+				n.stats.Tx.Bytes.IncrementBy(uint64(hdr.UsedLength() + pkt.Data.Size()))
 			}
 		}
 		return
@@ -833,7 +833,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 
 // DeliverTransportPacket delivers the packets to the appropriate transport
 // protocol endpoint.
-func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView) {
+func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) {
 	state, ok := n.stack.transportProtocols[protocol]
 	if !ok {
 		n.stack.stats.UnknownProtocolRcvdPackets.Increment()
@@ -845,41 +845,41 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 	// Raw socket packets are delivered based solely on the transport
 	// protocol number. We do not inspect the payload to ensure it's
 	// validly formed.
-	n.stack.demux.deliverRawPacket(r, protocol, netHeader, vv)
+	n.stack.demux.deliverRawPacket(r, protocol, pkt)
 
-	if len(vv.First()) < transProto.MinimumPacketSize() {
+	if len(pkt.Data.First()) < transProto.MinimumPacketSize() {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
 
-	srcPort, dstPort, err := transProto.ParsePorts(vv.First())
+	srcPort, dstPort, err := transProto.ParsePorts(pkt.Data.First())
 	if err != nil {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
 
 	id := TransportEndpointID{dstPort, r.LocalAddress, srcPort, r.RemoteAddress}
-	if n.stack.demux.deliverPacket(r, protocol, netHeader, vv, id) {
+	if n.stack.demux.deliverPacket(r, protocol, pkt, id) {
 		return
 	}
 
 	// Try to deliver to per-stack default handler.
 	if state.defaultHandler != nil {
-		if state.defaultHandler(r, id, netHeader, vv) {
+		if state.defaultHandler(r, id, pkt) {
 			return
 		}
 	}
 
 	// We could not find an appropriate destination for this packet, so
 	// deliver it to the global handler.
-	if !transProto.HandleUnknownDestinationPacket(r, id, netHeader, vv) {
+	if !transProto.HandleUnknownDestinationPacket(r, id, pkt) {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 	}
 }
 
 // DeliverTransportControlPacket delivers control packets to the appropriate
 // transport protocol endpoint.
-func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView) {
+func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer) {
 	state, ok := n.stack.transportProtocols[trans]
 	if !ok {
 		return
@@ -890,17 +890,17 @@ func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcp
 	// ICMPv4 only guarantees that 8 bytes of the transport protocol will
 	// be present in the payload. We know that the ports are within the
 	// first 8 bytes for all known transport protocols.
-	if len(vv.First()) < 8 {
+	if len(pkt.Data.First()) < 8 {
 		return
 	}
 
-	srcPort, dstPort, err := transProto.ParsePorts(vv.First())
+	srcPort, dstPort, err := transProto.ParsePorts(pkt.Data.First())
 	if err != nil {
 		return
 	}
 
 	id := TransportEndpointID{srcPort, local, dstPort, remote}
-	if n.stack.demux.deliverControlPacket(n, net, trans, typ, extra, vv, id) {
+	if n.stack.demux.deliverControlPacket(n, net, trans, typ, extra, pkt, id) {
 		return
 	}
 }
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index d7c124e81..5806d294c 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -64,16 +64,15 @@ type TransportEndpoint interface {
 	UniqueID() uint64
 
 	// HandlePacket is called by the stack when new packets arrive to
-	// this transport endpoint.
+	// this transport endpoint. It sets pkt.TransportHeader.
 	//
-	// HandlePacket takes ownership of vv.
-	HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView)
+	// HandlePacket takes ownership of pkt.
+	HandlePacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer)
 
-	// HandleControlPacket is called by the stack when new control (e.g.,
+	// HandleControlPacket is called by the stack when new control (e.g.
 	// ICMP) packets arrive to this transport endpoint.
-	//
-	// HandleControlPacket takes ownership of vv.
-	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView)
+	// HandleControlPacket takes ownership of pkt.
+	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt tcpip.PacketBuffer)
 
 	// Close puts the endpoint in a closed state and frees all resources
 	// associated with it. This cleanup may happen asynchronously. Wait can
@@ -99,8 +98,8 @@ type RawTransportEndpoint interface {
 	// this transport endpoint. The packet contains all data from the link
 	// layer up.
 	//
-	// HandlePacket takes ownership of packet and netHeader.
-	HandlePacket(r *Route, netHeader buffer.View, packet buffer.VectorisedView)
+	// HandlePacket takes ownership of pkt.
+	HandlePacket(r *Route, pkt tcpip.PacketBuffer)
 }
 
 // PacketEndpoint is the interface that needs to be implemented by packet
@@ -117,8 +116,8 @@ type PacketEndpoint interface {
 	// linkHeader may have a length of 0, in which case the PacketEndpoint
 	// should construct its own ethernet header for applications.
 	//
-	// HandlePacket takes ownership of packet and linkHeader.
-	HandlePacket(nicid tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, packet buffer.VectorisedView, linkHeader buffer.View)
+	// HandlePacket takes ownership of pkt.
+	HandlePacket(nicid tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
 }
 
 // TransportProtocol is the interface that needs to be implemented by transport
@@ -148,7 +147,9 @@ type TransportProtocol interface {
 	//
 	// The return value indicates whether the packet was well-formed (for
 	// stats purposes only).
-	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool
+	//
+	// HandleUnknownDestinationPacket takes ownership of pkt.
+	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) bool
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
@@ -166,17 +167,21 @@ type TransportProtocol interface {
 // the network layer.
 type TransportDispatcher interface {
 	// DeliverTransportPacket delivers packets to the appropriate
-	// transport protocol endpoint. It also returns the network layer
-	// header for the enpoint to inspect or pass up the stack.
+	// transport protocol endpoint.
+	//
+	// pkt.NetworkHeader must be set before calling DeliverTransportPacket.
 	//
-	// DeliverTransportPacket takes ownership of vv and netHeader.
-	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView)
+	// DeliverTransportPacket takes ownership of pkt.
+	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer)
 
 	// DeliverTransportControlPacket delivers control packets to the
 	// appropriate transport protocol endpoint.
 	//
-	// DeliverTransportControlPacket takes ownership of vv.
-	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView)
+	// pkt.NetworkHeader must be set before calling
+	// DeliverTransportControlPacket.
+	//
+	// DeliverTransportControlPacket takes ownership of pkt.
+	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer)
 }
 
 // PacketLooping specifies where an outbound packet should be sent.
@@ -248,10 +253,10 @@ type NetworkEndpoint interface {
 	NICID() tcpip.NICID
 
 	// HandlePacket is called by the link layer when new packets arrive to
-	// this network endpoint.
+	// this network endpoint. It sets pkt.NetworkHeader.
 	//
-	// HandlePacket takes ownership of vv.
-	HandlePacket(r *Route, vv buffer.VectorisedView)
+	// HandlePacket takes ownership of pkt.
+	HandlePacket(r *Route, pkt tcpip.PacketBuffer)
 
 	// Close is called when the endpoint is reomved from a stack.
 	Close()
@@ -294,11 +299,14 @@ type NetworkProtocol interface {
 // the data link layer.
 type NetworkDispatcher interface {
 	// DeliverNetworkPacket finds the appropriate network protocol endpoint
-	// and hands the packet over for further processing. linkHeader may have
-	// length 0 when the caller does not have ethernet data.
+	// and hands the packet over for further processing.
+	//
+	// pkt.LinkHeader may or may not be set before calling
+	// DeliverNetworkPacket. Some packets do not have link headers (e.g.
+	// packets sent via loopback), and won't have the field set.
 	//
-	// DeliverNetworkPacket takes ownership of vv and linkHeader.
-	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View)
+	// DeliverNetworkPacket takes ownership of pkt.
+	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
 }
 
 // LinkEndpointCapabilities is the type associated with the capabilities
@@ -329,7 +337,9 @@ const (
 
 // LinkEndpoint is the interface implemented by data link layer protocols (e.g.,
 // ethernet, loopback, raw) and used by network layer protocols to send packets
-// out through the implementer's data link endpoint.
+// out through the implementer's data link endpoint. When a link header exists,
+// it sets each tcpip.PacketBuffer's LinkHeader field before passing it up the
+// stack.
 type LinkEndpoint interface {
 	// MTU is the maximum transmission unit for this endpoint. This is
 	// usually dictated by the backing physical network; when such a
@@ -395,7 +405,7 @@ type InjectableLinkEndpoint interface {
 	LinkEndpoint
 
 	// InjectInbound injects an inbound packet.
-	InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView)
+	InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer)
 
 	// InjectOutbound writes a fully formed outbound packet directly to the
 	// link.
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 8b141cafd..08599d765 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -51,7 +51,7 @@ const (
 
 type transportProtocolState struct {
 	proto          TransportProtocol
-	defaultHandler func(r *Route, id TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool
+	defaultHandler func(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) bool
 }
 
 // TCPProbeFunc is the expected function type for a TCP probe function to be
@@ -641,7 +641,7 @@ func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber,
 //
 // It must be called only during initialization of the stack. Changing it as the
 // stack is operating is not supported.
-func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, buffer.View, buffer.VectorisedView) bool) {
+func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, tcpip.PacketBuffer) bool) {
 	state := s.transportProtocols[p]
 	if state != nil {
 		state.defaultHandler = h
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 9dae853d0..1fac5477f 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -86,28 +86,28 @@ func (f *fakeNetworkEndpoint) ID() *stack.NetworkEndpointID {
 	return &f.id
 }
 
-func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
+func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt tcpip.PacketBuffer) {
 	// Increment the received packet count in the protocol descriptor.
 	f.proto.packetCount[int(f.id.LocalAddress[0])%len(f.proto.packetCount)]++
 
 	// Consume the network header.
-	b := vv.First()
-	vv.TrimFront(fakeNetHeaderLen)
+	b := pkt.Data.First()
+	pkt.Data.TrimFront(fakeNetHeaderLen)
 
 	// Handle control packets.
 	if b[2] == uint8(fakeControlProtocol) {
-		nb := vv.First()
+		nb := pkt.Data.First()
 		if len(nb) < fakeNetHeaderLen {
 			return
 		}
 
-		vv.TrimFront(fakeNetHeaderLen)
-		f.dispatcher.DeliverTransportControlPacket(tcpip.Address(nb[1:2]), tcpip.Address(nb[0:1]), fakeNetNumber, tcpip.TransportProtocolNumber(nb[2]), stack.ControlPortUnreachable, 0, vv)
+		pkt.Data.TrimFront(fakeNetHeaderLen)
+		f.dispatcher.DeliverTransportControlPacket(tcpip.Address(nb[1:2]), tcpip.Address(nb[0:1]), fakeNetNumber, tcpip.TransportProtocolNumber(nb[2]), stack.ControlPortUnreachable, 0, pkt)
 		return
 	}
 
 	// Dispatch the packet to the transport protocol.
-	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(b[2]), buffer.View([]byte{}), vv)
+	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(b[2]), pkt)
 }
 
 func (f *fakeNetworkEndpoint) MaxHeaderLength() uint16 {
@@ -138,7 +138,9 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr bu
 		views[0] = hdr.View()
 		views = append(views, payload.Views()...)
 		vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views)
-		f.HandlePacket(r, vv)
+		f.HandlePacket(r, tcpip.PacketBuffer{
+			Data: vv,
+		})
 	}
 	if loop&stack.PacketOut == 0 {
 		return nil
@@ -259,7 +261,9 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet with wrong address is not delivered.
 	buf[0] = 3
-	ep.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeNet.packetCount[1] != 0 {
 		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 0)
 	}
@@ -269,7 +273,9 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet is delivered to first endpoint.
 	buf[0] = 1
-	ep.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeNet.packetCount[1] != 1 {
 		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1)
 	}
@@ -279,7 +285,9 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet is delivered to second endpoint.
 	buf[0] = 2
-	ep.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeNet.packetCount[1] != 1 {
 		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1)
 	}
@@ -288,7 +296,9 @@ func TestNetworkReceive(t *testing.T) {
 	}
 
 	// Make sure packet is not delivered if protocol number is wrong.
-	ep.Inject(fakeNetNumber-1, buf.ToVectorisedView())
+	ep.InjectInbound(fakeNetNumber-1, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeNet.packetCount[1] != 1 {
 		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1)
 	}
@@ -298,7 +308,9 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet that is too small is dropped.
 	buf.CapLength(2)
-	ep.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeNet.packetCount[1] != 1 {
 		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1)
 	}
@@ -373,7 +385,9 @@ func testFailingRecv(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte b
 
 func testRecvInternal(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte byte, ep *channel.Endpoint, buf buffer.View, want int) {
 	t.Helper()
-	ep.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if got := fakeNet.PacketCount(localAddrByte); got != want {
 		t.Errorf("receive packet count: got = %d, want %d", got, want)
 	}
@@ -1795,7 +1809,9 @@ func TestNICStats(t *testing.T) {
 
 	// Send a packet to address 1.
 	buf := buffer.NewView(30)
-	ep1.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if got, want := s.NICInfo()[1].Stats.Rx.Packets.Value(), uint64(1); got != want {
 		t.Errorf("got Rx.Packets.Value() = %d, want = %d", got, want)
 	}
@@ -1855,7 +1871,9 @@ func TestNICForwarding(t *testing.T) {
 	// Send a packet to address 3.
 	buf := buffer.NewView(30)
 	buf[0] = 3
-	ep1.Inject(fakeNetNumber, buf.ToVectorisedView())
+	ep1.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 
 	select {
 	case <-ep2.C:
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index ccd3d030e..594570216 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -21,7 +21,6 @@ import (
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
@@ -86,7 +85,7 @@ func (epsByNic *endpointsByNic) transportEndpoints() []TransportEndpoint {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) {
+func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
 	epsByNic.mu.RLock()
 
 	mpep, ok := epsByNic.endpoints[r.ref.nic.ID()]
@@ -100,18 +99,18 @@ func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, v
 	// If this is a broadcast or multicast datagram, deliver the datagram to all
 	// endpoints bound to the right device.
 	if isMulticastOrBroadcast(id.LocalAddress) {
-		mpep.handlePacketAll(r, id, vv)
+		mpep.handlePacketAll(r, id, pkt)
 		epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
 		return
 	}
 
 	// multiPortEndpoints are guaranteed to have at least one element.
-	selectEndpoint(id, mpep, epsByNic.seed).HandlePacket(r, id, vv)
+	selectEndpoint(id, mpep, epsByNic.seed).HandlePacket(r, id, pkt)
 	epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView) {
+func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt tcpip.PacketBuffer) {
 	epsByNic.mu.RLock()
 	defer epsByNic.mu.RUnlock()
 
@@ -127,7 +126,7 @@ func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpoint
 	// broadcast like we are doing with handlePacket above?
 
 	// multiPortEndpoints are guaranteed to have at least one element.
-	selectEndpoint(id, mpep, epsByNic.seed).HandleControlPacket(id, typ, extra, vv)
+	selectEndpoint(id, mpep, epsByNic.seed).HandleControlPacket(id, typ, extra, pkt)
 }
 
 // registerEndpoint returns true if it succeeds. It fails and returns
@@ -258,18 +257,16 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32
 	return mpep.endpointsArr[idx]
 }
 
-func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, vv buffer.VectorisedView) {
+func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
 	ep.mu.RLock()
 	for i, endpoint := range ep.endpointsArr {
-		// HandlePacket modifies vv, so each endpoint needs its own copy except for
-		// the final one.
+		// HandlePacket takes ownership of pkt, so each endpoint needs
+		// its own copy except for the final one.
 		if i == len(ep.endpointsArr)-1 {
-			endpoint.HandlePacket(r, id, vv)
+			endpoint.HandlePacket(r, id, pkt)
 			break
 		}
-		vvCopy := buffer.NewView(vv.Size())
-		copy(vvCopy, vv.ToView())
-		endpoint.HandlePacket(r, id, vvCopy.ToVectorisedView())
+		endpoint.HandlePacket(r, id, pkt.Clone())
 	}
 	ep.mu.RUnlock() // Don't use defer for performance reasons.
 }
@@ -395,7 +392,7 @@ var loopbackSubnet = func() tcpip.Subnet {
 // deliverPacket attempts to find one or more matching transport endpoints, and
 // then, if matches are found, delivers the packet to them. Returns true if it
 // found one or more endpoints, false otherwise.
-func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
 	if !ok {
 		return false
@@ -408,8 +405,8 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 	// transport endpoints.
 	var destEps []*endpointsByNic
 	if protocol == header.UDPProtocolNumber && isMulticastOrBroadcast(id.LocalAddress) {
-		destEps = d.findAllEndpointsLocked(eps, vv, id)
-	} else if ep := d.findEndpointLocked(eps, vv, id); ep != nil {
+		destEps = d.findAllEndpointsLocked(eps, id)
+	} else if ep := d.findEndpointLocked(eps, id); ep != nil {
 		destEps = append(destEps, ep)
 	}
 
@@ -424,17 +421,19 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 		return false
 	}
 
-	// Deliver the packet.
-	for _, ep := range destEps {
-		ep.handlePacket(r, id, vv)
+	// HandlePacket takes ownership of pkt, so each endpoint needs its own
+	// copy except for the final one.
+	for _, ep := range destEps[:len(destEps)-1] {
+		ep.handlePacket(r, id, pkt.Clone())
 	}
+	destEps[len(destEps)-1].handlePacket(r, id, pkt)
 
 	return true
 }
 
 // deliverRawPacket attempts to deliver the given packet and returns whether it
 // was delivered successfully.
-func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView) bool {
+func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt tcpip.PacketBuffer) bool {
 	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
 	if !ok {
 		return false
@@ -448,7 +447,7 @@ func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportPr
 	for _, rawEP := range eps.rawEndpoints {
 		// Each endpoint gets its own copy of the packet for the sake
 		// of save/restore.
-		rawEP.HandlePacket(r, buffer.NewViewFromBytes(netHeader), vv.ToView().ToVectorisedView())
+		rawEP.HandlePacket(r, pkt)
 		foundRaw = true
 	}
 	eps.mu.RUnlock()
@@ -458,7 +457,7 @@ func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportPr
 
 // deliverControlPacket attempts to deliver the given control packet. Returns
 // true if it found an endpoint, false otherwise.
-func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt tcpip.PacketBuffer, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{net, trans}]
 	if !ok {
 		return false
@@ -466,7 +465,7 @@ func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtoco
 
 	// Try to find the endpoint.
 	eps.mu.RLock()
-	ep := d.findEndpointLocked(eps, vv, id)
+	ep := d.findEndpointLocked(eps, id)
 	eps.mu.RUnlock()
 
 	// Fail if we didn't find one.
@@ -475,12 +474,12 @@ func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtoco
 	}
 
 	// Deliver the packet.
-	ep.handleControlPacket(n, id, typ, extra, vv)
+	ep.handleControlPacket(n, id, typ, extra, pkt)
 
 	return true
 }
 
-func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, vv buffer.VectorisedView, id TransportEndpointID) []*endpointsByNic {
+func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, id TransportEndpointID) []*endpointsByNic {
 	var matchedEPs []*endpointsByNic
 	// Try to find a match with the id as provided.
 	if ep, ok := eps.endpoints[id]; ok {
@@ -514,8 +513,8 @@ func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, vv bu
 
 // findEndpointLocked returns the endpoint that most closely matches the given
 // id.
-func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer.VectorisedView, id TransportEndpointID) *endpointsByNic {
-	if matchedEPs := d.findAllEndpointsLocked(eps, vv, id); len(matchedEPs) > 0 {
+func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, id TransportEndpointID) *endpointsByNic {
+	if matchedEPs := d.findAllEndpointsLocked(eps, id); len(matchedEPs) > 0 {
 		return matchedEPs[0]
 	}
 	return nil
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 210233dc0..f54117c4e 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -156,7 +156,9 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpName string
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEPs[linkEpName].Inject(ipv6.ProtocolNumber, buf.ToVectorisedView())
+	c.linkEPs[linkEpName].InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 }
 
 func TestTransportDemuxerRegister(t *testing.T) {
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 203e79f56..2cacea99a 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -197,7 +197,7 @@ func (*fakeTransportEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Erro
 	return tcpip.FullAddress{}, nil
 }
 
-func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ buffer.VectorisedView) {
+func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ tcpip.PacketBuffer) {
 	// Increment the number of received packets.
 	f.proto.packetCount++
 	if f.acceptQueue != nil {
@@ -214,7 +214,7 @@ func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportE
 	}
 }
 
-func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, buffer.VectorisedView) {
+func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, tcpip.PacketBuffer) {
 	// Increment the number of received control packets.
 	f.proto.controlCount++
 }
@@ -271,7 +271,7 @@ func (*fakeTransportProtocol) ParsePorts(buffer.View) (src, dst uint16, err *tcp
 	return 0, 0, nil
 }
 
-func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, buffer.View, buffer.VectorisedView) bool {
+func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
 	return true
 }
 
@@ -342,7 +342,9 @@ func TestTransportReceive(t *testing.T) {
 	// Make sure packet with wrong protocol is not delivered.
 	buf[0] = 1
 	buf[2] = 0
-	linkEP.Inject(fakeNetNumber, buf.ToVectorisedView())
+	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeTrans.packetCount != 0 {
 		t.Errorf("packetCount = %d, want %d", fakeTrans.packetCount, 0)
 	}
@@ -351,7 +353,9 @@ func TestTransportReceive(t *testing.T) {
 	buf[0] = 1
 	buf[1] = 3
 	buf[2] = byte(fakeTransNumber)
-	linkEP.Inject(fakeNetNumber, buf.ToVectorisedView())
+	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeTrans.packetCount != 0 {
 		t.Errorf("packetCount = %d, want %d", fakeTrans.packetCount, 0)
 	}
@@ -360,7 +364,9 @@ func TestTransportReceive(t *testing.T) {
 	buf[0] = 1
 	buf[1] = 2
 	buf[2] = byte(fakeTransNumber)
-	linkEP.Inject(fakeNetNumber, buf.ToVectorisedView())
+	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeTrans.packetCount != 1 {
 		t.Errorf("packetCount = %d, want %d", fakeTrans.packetCount, 1)
 	}
@@ -413,7 +419,9 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 0
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = 0
-	linkEP.Inject(fakeNetNumber, buf.ToVectorisedView())
+	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeTrans.controlCount != 0 {
 		t.Errorf("controlCount = %d, want %d", fakeTrans.controlCount, 0)
 	}
@@ -422,7 +430,9 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 3
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
-	linkEP.Inject(fakeNetNumber, buf.ToVectorisedView())
+	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeTrans.controlCount != 0 {
 		t.Errorf("controlCount = %d, want %d", fakeTrans.controlCount, 0)
 	}
@@ -431,7 +441,9 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 2
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
-	linkEP.Inject(fakeNetNumber, buf.ToVectorisedView())
+	linkEP.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 	if fakeTrans.controlCount != 1 {
 		t.Errorf("controlCount = %d, want %d", fakeTrans.controlCount, 1)
 	}
@@ -584,7 +596,9 @@ func TestTransportForwarding(t *testing.T) {
 	req[0] = 1
 	req[1] = 3
 	req[2] = byte(fakeTransNumber)
-	ep2.Inject(fakeNetNumber, req.ToVectorisedView())
+	ep2.InjectInbound(fakeNetNumber, tcpip.PacketBuffer{
+		Data: req.ToVectorisedView(),
+	})
 
 	aep, _, err := ep.Accept()
 	if err != nil || aep == nil {
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 33405eb7d..0092d0ea9 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -718,18 +718,18 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
 	// Only accept echo replies.
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
-		h := header.ICMPv4(vv.First())
+		h := header.ICMPv4(pkt.Data.First())
 		if h.Type() != header.ICMPv4EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 			return
 		}
 	case header.IPv6ProtocolNumber:
-		h := header.ICMPv6(vv.First())
+		h := header.ICMPv6(pkt.Data.First())
 		if h.Type() != header.ICMPv6EchoReply {
 			e.stack.Stats().DroppedPackets.Increment()
 			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
@@ -757,19 +757,19 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
 	wasEmpty := e.rcvBufSize == 0
 
 	// Push new packet into receive list and increment the buffer size.
-	pkt := &icmpPacket{
+	packet := &icmpPacket{
 		senderAddress: tcpip.FullAddress{
 			NIC:  r.NICID(),
 			Addr: id.RemoteAddress,
 		},
 	}
 
-	pkt.data = vv
+	packet.data = pkt.Data
 
-	e.rcvList.PushBack(pkt)
-	e.rcvBufSize += pkt.data.Size()
+	e.rcvList.PushBack(packet)
+	e.rcvBufSize += packet.data.Size()
 
-	pkt.timestamp = e.stack.NowNanoseconds()
+	packet.timestamp = e.stack.NowNanoseconds()
 
 	e.rcvMu.Unlock()
 	e.stats.PacketsReceived.Increment()
@@ -780,7 +780,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
 }
 
 // State implements tcpip.Endpoint.State. The ICMP endpoint currently doesn't
diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go
index bfb16f7c3..9ce500e80 100644
--- a/pkg/tcpip/transport/icmp/protocol.go
+++ b/pkg/tcpip/transport/icmp/protocol.go
@@ -104,7 +104,7 @@ func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error)
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (p *protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, buffer.View, buffer.VectorisedView) bool {
+func (p *protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, tcpip.PacketBuffer) bool {
 	return true
 }
 
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index ead83b83d..26335094e 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -266,7 +266,7 @@ func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 // HandlePacket implements stack.PacketEndpoint.HandlePacket.
-func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, ethHeader buffer.View) {
+func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	ep.rcvMu.Lock()
 
 	// Drop the packet if our buffer is currently full.
@@ -289,9 +289,9 @@ func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress,
 	// Push new packet into receive list and increment the buffer size.
 	var packet packet
 	// TODO(b/129292371): Return network protocol.
-	if len(ethHeader) > 0 {
+	if len(pkt.LinkHeader) > 0 {
 		// Get info directly from the ethernet header.
-		hdr := header.Ethernet(ethHeader)
+		hdr := header.Ethernet(pkt.LinkHeader)
 		packet.senderAddr = tcpip.FullAddress{
 			NIC:  nicid,
 			Addr: tcpip.Address(hdr.SourceAddress()),
@@ -306,11 +306,12 @@ func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress,
 
 	if ep.cooked {
 		// Cooked packets can simply be queued.
-		packet.data = vv
+		packet.data = pkt.Data
 	} else {
 		// Raw packets need their ethernet headers prepended before
 		// queueing.
-		if len(ethHeader) == 0 {
+		var linkHeader buffer.View
+		if len(pkt.LinkHeader) == 0 {
 			// We weren't provided with an actual ethernet header,
 			// so fake one.
 			ethFields := header.EthernetFields{
@@ -320,10 +321,12 @@ func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress,
 			}
 			fakeHeader := make(header.Ethernet, header.EthernetMinimumSize)
 			fakeHeader.Encode(&ethFields)
-			ethHeader = buffer.View(fakeHeader)
+			linkHeader = buffer.View(fakeHeader)
+		} else {
+			linkHeader = append(buffer.View(nil), pkt.LinkHeader...)
 		}
-		combinedVV := buffer.View(ethHeader).ToVectorisedView()
-		combinedVV.Append(vv)
+		combinedVV := linkHeader.ToVectorisedView()
+		combinedVV.Append(pkt.Data)
 		packet.data = combinedVV
 	}
 	packet.timestampNS = ep.stack.NowNanoseconds()
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 23922a30e..230a1537a 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -555,7 +555,7 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 // HandlePacket implements stack.RawTransportEndpoint.HandlePacket.
-func (e *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv buffer.VectorisedView) {
+func (e *endpoint) HandlePacket(route *stack.Route, pkt tcpip.PacketBuffer) {
 	e.rcvMu.Lock()
 
 	// Drop the packet if our buffer is currently full.
@@ -596,20 +596,21 @@ func (e *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv bu
 	wasEmpty := e.rcvBufSize == 0
 
 	// Push new packet into receive list and increment the buffer size.
-	pkt := &rawPacket{
+	packet := &rawPacket{
 		senderAddr: tcpip.FullAddress{
 			NIC:  route.NICID(),
 			Addr: route.RemoteAddress,
 		},
 	}
 
-	combinedVV := netHeader.ToVectorisedView()
-	combinedVV.Append(vv)
-	pkt.data = combinedVV
-	pkt.timestampNS = e.stack.NowNanoseconds()
+	networkHeader := append(buffer.View(nil), pkt.NetworkHeader...)
+	combinedVV := networkHeader.ToVectorisedView()
+	combinedVV.Append(pkt.Data)
+	packet.data = combinedVV
+	packet.timestampNS = e.stack.NowNanoseconds()
 
-	e.rcvList.PushBack(pkt)
-	e.rcvBufSize += pkt.data.Size()
+	e.rcvList.PushBack(packet)
+	e.rcvBufSize += packet.data.Size()
 
 	e.rcvMu.Unlock()
 	e.stats.PacketsReceived.Increment()
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index a1efd8d55..e31464c9b 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2029,8 +2029,8 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) {
-	s := newSegment(r, id, vv)
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+	s := newSegment(r, id, pkt)
 	if !s.parse() {
 		e.stack.Stats().MalformedRcvdPackets.Increment()
 		e.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
@@ -2065,7 +2065,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
 	switch typ {
 	case stack.ControlPacketTooBig:
 		e.sndBufMu.Lock()
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index 63666f0b3..4983bca81 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -18,7 +18,6 @@ import (
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -63,8 +62,8 @@ func NewForwarder(s *stack.Stack, rcvWnd, maxInFlight int, handler func(*Forward
 //
 // This function is expected to be passed as an argument to the
 // stack.SetTransportProtocolHandler function.
-func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool {
-	s := newSegment(r, id, vv)
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+	s := newSegment(r, id, pkt)
 	defer s.decRef()
 
 	// We only care about well-formed SYN packets.
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index db40785d3..c4f1a84bb 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -126,8 +126,8 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 // a reset is sent in response to any incoming segment except another reset. In
 // particular, SYNs addressed to a non-existent connection are rejected by this
 // means."
-func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool {
-	s := newSegment(r, id, vv)
+func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
+	s := newSegment(r, id, pkt)
 	defer s.decRef()
 
 	if !s.parse() || !s.csumValid {
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index c4a89525e..1c10da5ca 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -18,6 +18,7 @@ import (
 	"sync/atomic"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
@@ -60,13 +61,13 @@ type segment struct {
 	xmitTime time.Time `state:".(unixTime)"`
 }
 
-func newSegment(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) *segment {
+func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) *segment {
 	s := &segment{
 		refCnt: 1,
 		id:     id,
 		route:  r.Clone(),
 	}
-	s.data = vv.Clone(s.views[:])
+	s.data = pkt.Data.Clone(s.views[:])
 	s.rcvdTime = time.Now()
 	return s
 }
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index ef823e4ae..4854e719d 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -302,7 +302,9 @@ func (c *Context) SendICMPPacket(typ header.ICMPv4Type, code uint8, p1, p2 []byt
 	copy(icmp[header.ICMPv4PayloadOffset:], p2)
 
 	// Inject packet.
-	c.linkEP.Inject(ipv4.ProtocolNumber, buf.ToVectorisedView())
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 }
 
 // BuildSegment builds a TCP segment based on the given Headers and payload.
@@ -350,13 +352,17 @@ func (c *Context) BuildSegment(payload []byte, h *Headers) buffer.VectorisedView
 // SendSegment sends a TCP segment that has already been built and written to a
 // buffer.VectorisedView.
 func (c *Context) SendSegment(s buffer.VectorisedView) {
-	c.linkEP.Inject(ipv4.ProtocolNumber, s)
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+		Data: s,
+	})
 }
 
 // SendPacket builds and sends a TCP segment(with the provided payload & TCP
 // headers) in an IPv4 packet via the link layer endpoint.
 func (c *Context) SendPacket(payload []byte, h *Headers) {
-	c.linkEP.Inject(ipv4.ProtocolNumber, c.BuildSegment(payload, h))
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+		Data: c.BuildSegment(payload, h),
+	})
 }
 
 // SendAck sends an ACK packet.
@@ -518,7 +524,9 @@ func (c *Context) SendV6Packet(payload []byte, h *Headers) {
 	t.SetChecksum(^t.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.Inject(ipv6.ProtocolNumber, buf.ToVectorisedView())
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
 }
 
 // CreateConnected creates a connected TCP endpoint.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 03bd5c8fd..4e11de9db 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -1158,17 +1158,17 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
 	// Get the header then trim it from the view.
-	hdr := header.UDP(vv.First())
-	if int(hdr.Length()) > vv.Size() {
+	hdr := header.UDP(pkt.Data.First())
+	if int(hdr.Length()) > pkt.Data.Size() {
 		// Malformed packet.
 		e.stack.Stats().UDP.MalformedPacketsReceived.Increment()
 		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 		return
 	}
 
-	vv.TrimFront(header.UDPMinimumSize)
+	pkt.Data.TrimFront(header.UDPMinimumSize)
 
 	e.rcvMu.Lock()
 	e.stack.Stats().UDP.PacketsReceived.Increment()
@@ -1192,18 +1192,18 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
 	wasEmpty := e.rcvBufSize == 0
 
 	// Push new packet into receive list and increment the buffer size.
-	pkt := &udpPacket{
+	packet := &udpPacket{
 		senderAddress: tcpip.FullAddress{
 			NIC:  r.NICID(),
 			Addr: id.RemoteAddress,
 			Port: hdr.SourcePort(),
 		},
 	}
-	pkt.data = vv
-	e.rcvList.PushBack(pkt)
-	e.rcvBufSize += vv.Size()
+	packet.data = pkt.Data
+	e.rcvList.PushBack(packet)
+	e.rcvBufSize += pkt.Data.Size()
 
-	pkt.timestamp = e.stack.NowNanoseconds()
+	packet.timestamp = e.stack.NowNanoseconds()
 
 	e.rcvMu.Unlock()
 
@@ -1214,7 +1214,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt tcpip.PacketBuffer) {
 }
 
 // State implements tcpip.Endpoint.State.
diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go
index d399ec722..fc706ede2 100644
--- a/pkg/tcpip/transport/udp/forwarder.go
+++ b/pkg/tcpip/transport/udp/forwarder.go
@@ -16,7 +16,6 @@ package udp
 
 import (
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -44,12 +43,12 @@ func NewForwarder(s *stack.Stack, handler func(*ForwarderRequest)) *Forwarder {
 //
 // This function is expected to be passed as an argument to the
 // stack.SetTransportProtocolHandler function.
-func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool {
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
 	f.handler(&ForwarderRequest{
 		stack: f.stack,
 		route: r,
 		id:    id,
-		vv:    vv,
+		pkt:   pkt,
 	})
 
 	return true
@@ -62,7 +61,7 @@ type ForwarderRequest struct {
 	stack *stack.Stack
 	route *stack.Route
 	id    stack.TransportEndpointID
-	vv    buffer.VectorisedView
+	pkt   tcpip.PacketBuffer
 }
 
 // ID returns the 4-tuple (src address, src port, dst address, dst port) that
@@ -90,7 +89,7 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint,
 	ep.rcvReady = true
 	ep.rcvMu.Unlock()
 
-	ep.HandlePacket(r.route, r.id, r.vv)
+	ep.HandlePacket(r.route, r.id, r.pkt)
 
 	return ep, nil
 }
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 5c3358a5e..43f11b700 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -66,10 +66,10 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool {
+func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) bool {
 	// Get the header then trim it from the view.
-	hdr := header.UDP(vv.First())
-	if int(hdr.Length()) > vv.Size() {
+	hdr := header.UDP(pkt.Data.First())
+	if int(hdr.Length()) > pkt.Data.Size() {
 		// Malformed packet.
 		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
 		return true
@@ -116,20 +116,18 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		}
 		headerLen := int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize
 		available := int(mtu) - headerLen
-		payloadLen := len(netHeader) + vv.Size()
+		payloadLen := len(pkt.NetworkHeader) + pkt.Data.Size()
 		if payloadLen > available {
 			payloadLen = available
 		}
 
-		// The buffers used by vv and netHeader may be used elsewhere
-		// in the system.  For example, a raw or packet socket may use
-		// what UDP considers an unreachable destination. Thus we deep
-		// copy vv and netHeader to prevent multiple ownership and SR
-		// errors.
-		newNetHeader := make(buffer.View, len(netHeader))
-		copy(newNetHeader, netHeader)
-		payload := buffer.NewVectorisedView(len(newNetHeader), []buffer.View{newNetHeader})
-		payload.Append(vv.ToView().ToVectorisedView())
+		// The buffers used by pkt may be used elsewhere in the system.
+		// For example, a raw or packet socket may use what UDP
+		// considers an unreachable destination. Thus we deep copy pkt
+		// to prevent multiple ownership and SR errors.
+		newNetHeader := append(buffer.View(nil), pkt.NetworkHeader...)
+		payload := newNetHeader.ToVectorisedView()
+		payload.Append(pkt.Data.ToView().ToVectorisedView())
 		payload.CapLength(payloadLen)
 
 		hdr := buffer.NewPrependable(headerLen)
@@ -158,12 +156,12 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		}
 		headerLen := int(r.MaxHeaderLength()) + header.ICMPv6DstUnreachableMinimumSize
 		available := int(mtu) - headerLen
-		payloadLen := len(netHeader) + vv.Size()
+		payloadLen := len(pkt.NetworkHeader) + pkt.Data.Size()
 		if payloadLen > available {
 			payloadLen = available
 		}
-		payload := buffer.NewVectorisedView(len(netHeader), []buffer.View{netHeader})
-		payload.Append(vv)
+		payload := buffer.NewVectorisedView(len(pkt.NetworkHeader), []buffer.View{pkt.NetworkHeader})
+		payload.Append(pkt.Data)
 		payload.CapLength(payloadLen)
 
 		hdr := buffer.NewPrependable(headerLen)
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index b724d788c..30ee9801b 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -397,7 +397,8 @@ func (c *testContext) injectPacket(flow testFlow, payload []byte) {
 func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool) {
 	// Allocate a buffer for data and headers.
 	buf := buffer.NewView(header.UDPMinimumSize + header.IPv6MinimumSize + len(payload))
-	copy(buf[len(buf)-len(payload):], payload)
+	payloadStart := len(buf) - len(payload)
+	copy(buf[payloadStart:], payload)
 
 	// Initialize the IP header.
 	ip := header.IPv6(buf)
@@ -431,7 +432,11 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.Inject(ipv6.ProtocolNumber, buf.ToVectorisedView())
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+		Data:            buf.ToVectorisedView(),
+		NetworkHeader:   buffer.View(ip),
+		TransportHeader: buffer.View(u),
+	})
 }
 
 // injectV4Packet creates a V4 test packet with the given payload and header
@@ -441,7 +446,8 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool) {
 	// Allocate a buffer for data and headers.
 	buf := buffer.NewView(header.UDPMinimumSize + header.IPv4MinimumSize + len(payload))
-	copy(buf[len(buf)-len(payload):], payload)
+	payloadStart := len(buf) - len(payload)
+	copy(buf[payloadStart:], payload)
 
 	// Initialize the IP header.
 	ip := header.IPv4(buf)
@@ -471,7 +477,12 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.Inject(ipv4.ProtocolNumber, buf.ToVectorisedView())
+
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, tcpip.PacketBuffer{
+		Data:            buf.ToVectorisedView(),
+		NetworkHeader:   buffer.View(ip),
+		TransportHeader: buffer.View(u),
+	})
 }
 
 func newPayload() []byte {
diff --git a/test/syscalls/linux/raw_socket_icmp.cc b/test/syscalls/linux/raw_socket_icmp.cc
index 8bcaba6f1..3de898df7 100644
--- a/test/syscalls/linux/raw_socket_icmp.cc
+++ b/test/syscalls/linux/raw_socket_icmp.cc
@@ -129,7 +129,7 @@ TEST_F(RawSocketICMPTest, SendAndReceiveBadChecksum) {
   EXPECT_THAT(RetryEINTR(recv)(s_, recv_buf, sizeof(recv_buf), MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
-//
+
 // Send and receive an ICMP packet.
 TEST_F(RawSocketICMPTest, SendAndReceive) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
-- 
cgit v1.2.3


From 66ebb6575f929a389d3c929977ed5e31d706fcfe Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 7 Nov 2019 09:45:26 -0800
Subject: Add support for TIME_WAIT timeout.

This change adds explicit support for honoring the 2MSL timeout
for sockets in TIME_WAIT state. It also adds support for the
TCP_LINGER2 option that allows modification of the FIN_WAIT2
state timeout duration for a given socket.

It also adds an option to modify the Stack wide TIME_WAIT timeout
but this is only for testing. On Linux this is fixed at 60s.

Further, we also now correctly process RST's in CLOSE_WAIT and
close the socket similar to linux without moving it to error
state.

We also now handle SYN in ESTABLISHED state as per
RFC5961#section-4.1. Earlier we would just drop these SYNs,
which could cause some tests that pass on Linux to fail on
gVisor.

Netstack now honors TIME_WAIT correctly as well as handles the
following cases correctly.

- TCP RSTs in TIME_WAIT are ignored.
- A duplicate TCP FIN during TIME_WAIT extends the TIME_WAIT
  and a dup ACK is sent in response to the FIN as the dup FIN
  indicates potential loss of the original final ACK.
- An out of order segment during TIME_WAIT generates a dup ACK.
- A new SYN w/ a sequence number > the highest sequence number
  in the previous connection closes the TIME_WAIT early and
  opens a new connection.

Further, to make the SYN case work correctly, the ISN (Initial
Sequence Number) generation for Netstack has been updated to
be as per RFC. It is no longer a purely random number and follows
the recommendation in https://tools.ietf.org/html/rfc6528#page-3.

The current hash used is not a cryptographically secure hash
function. A separate change will update the hash function used
to Siphash similar to what is used in Linux.

PiperOrigin-RevId: 279106406
---
 pkg/sentry/socket/netstack/netstack.go       |  20 +
 pkg/tcpip/adapters/gonet/gonet_test.go       |  12 +-
 pkg/tcpip/stack/stack.go                     |  20 +-
 pkg/tcpip/stack/transport_demuxer.go         |  33 +-
 pkg/tcpip/tcpip.go                           |  12 +-
 pkg/tcpip/transport/tcp/BUILD                |   2 +-
 pkg/tcpip/transport/tcp/accept.go            |  17 +-
 pkg/tcpip/transport/tcp/connect.go           | 322 ++++++++++++--
 pkg/tcpip/transport/tcp/endpoint.go          | 101 ++++-
 pkg/tcpip/transport/tcp/endpoint_state.go    |  26 +-
 pkg/tcpip/transport/tcp/protocol.go          |  43 ++
 pkg/tcpip/transport/tcp/rcv.go               | 167 ++++++-
 pkg/tcpip/transport/tcp/tcp_test.go          | 622 ++++++++++++++++++++++++++-
 test/syscalls/BUILD                          |  22 +-
 test/syscalls/linux/BUILD                    |   1 +
 test/syscalls/linux/socket_inet_loopback.cc  | 336 +++++++++++++++
 test/syscalls/linux/socket_ip_tcp_generic.cc |  93 ++++
 17 files changed, 1736 insertions(+), 113 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 27c6692c4..d92399efd 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1173,6 +1173,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 		copy(b, v)
 		return b, nil
 
+	case linux.TCP_LINGER2:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.TCPLingerTimeoutOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		return int32(time.Duration(v) / time.Second), nil
+
 	default:
 		emitUnimplementedEventTCP(t, name)
 	}
@@ -1556,6 +1568,14 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 		return nil
 
+	case linux.TCP_LINGER2:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+
+		v := usermem.ByteOrder.Uint32(optVal)
+		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))))
+
 	case linux.TCP_REPAIR_OPTIONS:
 		t.Kernel().EmitUnimplementedEvent(t)
 
diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go
index 8ced960bb..ee077ae83 100644
--- a/pkg/tcpip/adapters/gonet/gonet_test.go
+++ b/pkg/tcpip/adapters/gonet/gonet_test.go
@@ -151,10 +151,8 @@ func TestCloseReader(t *testing.T) {
 
 		buf := make([]byte, 256)
 		n, err := c.Read(buf)
-		got, ok := err.(*net.OpError)
-		want := tcpip.ErrConnectionAborted
-		if n != 0 || !ok || got.Err.Error() != want.String() {
-			t.Errorf("c.Read() = (%d, %v), want (0, OpError(%v))", n, err, want)
+		if n != 0 || err != io.EOF {
+			t.Errorf("c.Read() = (%d, %v), want (0, EOF)", n, err)
 		}
 	}()
 	sender, err := connect(s, addr)
@@ -203,10 +201,8 @@ func TestCloseReaderWithForwarder(t *testing.T) {
 
 		buf := make([]byte, 256)
 		n, e := c.Read(buf)
-		got, ok := e.(*net.OpError)
-		want := tcpip.ErrConnectionAborted
-		if n != 0 || !ok || got.Err.Error() != want.String() {
-			t.Errorf("c.Read() = (%d, %v), want (0, OpError(%v))", n, e, want)
+		if n != 0 || e != io.EOF {
+			t.Errorf("c.Read() = (%d, %v), want (0, EOF)", n, e)
 		}
 	})
 	s.SetTransportProtocolHandler(tcp.ProtocolNumber, fwd.HandlePacket)
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 99809df75..2f8d8e822 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -402,11 +402,11 @@ type Stack struct {
 	// by the stack.
 	icmpRateLimiter *ICMPRateLimiter
 
-	// portSeed is a one-time random value initialized at stack startup
+	// seed is a one-time random value initialized at stack startup
 	// and is used to seed the TCP port picking on active connections
 	//
 	// TODO(gvisor.dev/issue/940): S/R this field.
-	portSeed uint32
+	seed uint32
 
 	// ndpConfigs is the default NDP configurations used by interfaces.
 	ndpConfigs NDPConfigurations
@@ -544,7 +544,7 @@ func New(opts Options) *Stack {
 		stats:                opts.Stats.FillIn(),
 		handleLocal:          opts.HandleLocal,
 		icmpRateLimiter:      NewICMPRateLimiter(),
-		portSeed:             generateRandUint32(),
+		seed:                 generateRandUint32(),
 		ndpConfigs:           opts.NDPConfigs,
 		autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
 		uniqueIDGenerator:    opts.UniqueID,
@@ -1186,6 +1186,12 @@ func (s *Stack) CompleteTransportEndpointCleanup(ep TransportEndpoint) {
 	s.mu.Unlock()
 }
 
+// FindTransportEndpoint finds an endpoint that most closely matches the provided
+// id. If no endpoint is found it returns nil.
+func (s *Stack) FindTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, r *Route) TransportEndpoint {
+	return s.demux.findTransportEndpoint(netProto, transProto, id, r)
+}
+
 // RegisterRawTransportEndpoint registers the given endpoint with the stack
 // transport dispatcher. Received packets that match the provided transport
 // protocol will be delivered to the given endpoint.
@@ -1573,12 +1579,12 @@ func (s *Stack) HandleNDPRA(id tcpip.NICID, ip tcpip.Address, ra header.NDPRoute
 	return nil
 }
 
-// PortSeed returns a 32 bit value that can be used as a seed value for port
-// picking.
+// Seed returns a 32 bit value that can be used as a seed value for port
+// picking, ISN generation etc.
 //
 // NOTE: The seed is generated once during stack initialization only.
-func (s *Stack) PortSeed() uint32 {
-	return s.portSeed
+func (s *Stack) Seed() uint32 {
+	return s.seed
 }
 
 func generateRandUint32() uint32 {
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index 594570216..cb805522b 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -103,7 +103,6 @@ func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, p
 		epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
 		return
 	}
-
 	// multiPortEndpoints are guaranteed to have at least one element.
 	selectEndpoint(id, mpep, epsByNic.seed).HandlePacket(r, id, pkt)
 	epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
@@ -507,10 +506,40 @@ func (d *transportDemuxer) findAllEndpointsLocked(eps *transportEndpoints, id Tr
 	if ep, ok := eps.endpoints[nid]; ok {
 		matchedEPs = append(matchedEPs, ep)
 	}
-
 	return matchedEPs
 }
 
+// findTransportEndpoint finds a single endpoint that most closely matches the provided id.
+func (d *transportDemuxer) findTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, r *Route) TransportEndpoint {
+	eps, ok := d.protocol[protocolIDs{netProto, transProto}]
+	if !ok {
+		return nil
+	}
+	// Try to find the endpoint.
+	eps.mu.RLock()
+	epsByNic := d.findEndpointLocked(eps, id)
+	// Fail if we didn't find one.
+	if epsByNic == nil {
+		eps.mu.RUnlock()
+		return nil
+	}
+
+	epsByNic.mu.RLock()
+	eps.mu.RUnlock()
+
+	mpep, ok := epsByNic.endpoints[r.ref.nic.ID()]
+	if !ok {
+		if mpep, ok = epsByNic.endpoints[0]; !ok {
+			epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
+			return nil
+		}
+	}
+
+	ep := selectEndpoint(id, mpep, epsByNic.seed)
+	epsByNic.mu.RUnlock()
+	return ep
+}
+
 // findEndpointLocked returns the endpoint that most closely matches the given
 // id.
 func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, id TransportEndpointID) *endpointsByNic {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 3edb513d4..bd5eb89ca 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -586,6 +586,16 @@ type MaxSegOption int
 // A zero value indicates the default.
 type TTLOption uint8
 
+// TCPLingerTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
+// maximum duration for which a socket lingers in the TCP_FIN_WAIT_2 state
+// before being marked closed.
+type TCPLingerTimeoutOption time.Duration
+
+// TCPTimeWaitTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
+// maximum duration for which a socket lingers in the TIME_WAIT state
+// before being marked closed.
+type TCPTimeWaitTimeoutOption time.Duration
+
 // MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
 // TTL value for multicast messages. The default is 1.
 type MulticastTTLOption uint8
@@ -1329,8 +1339,8 @@ var (
 
 // GetDanglingEndpoints returns all dangling endpoints.
 func GetDanglingEndpoints() []Endpoint {
-	es := make([]Endpoint, 0, len(danglingEndpoints))
 	danglingEndpointsMu.Lock()
+	es := make([]Endpoint, 0, len(danglingEndpoints))
 	for e := range danglingEndpoints {
 		es = append(es, e)
 	}
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index f1dbc6f91..3f47b328d 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -71,7 +71,7 @@ filegroup(
 
 go_test(
     name = "tcp_test",
-    size = "small",
+    size = "medium",
     srcs = [
         "dual_stack_test.go",
         "sack_scoreboard_test.go",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index cb0e13ebc..0e8e0a2b4 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -269,8 +269,8 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions) (*endpoint, *tcpip.Error) {
 	// Create new endpoint.
 	irs := s.sequenceNumber
-	cookie := l.createCookie(s.id, irs, encodeMSS(opts.MSS))
-	ep, err := l.createConnectingEndpoint(s, cookie, irs, opts)
+	isn := generateSecureISN(s.id, l.stack.Seed())
+	ep, err := l.createConnectingEndpoint(s, isn, irs, opts)
 	if err != nil {
 		return nil, err
 	}
@@ -289,7 +289,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	// Perform the 3-way handshake.
 	h := newHandshake(ep, seqnum.Size(ep.initialReceiveWindow()))
 
-	h.resetToSynRcvd(cookie, irs, opts)
+	h.resetToSynRcvd(isn, irs, opts)
 	if err := h.execute(); err != nil {
 		ep.Close()
 		if l.listenEP != nil {
@@ -361,6 +361,7 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 	defer decSynRcvdCount()
 	defer e.decSynRcvdCount()
 	defer s.decRef()
+
 	n, err := ctx.createEndpointAndPerformHandshake(s, opts)
 	if err != nil {
 		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
@@ -368,6 +369,11 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 		return
 	}
 	ctx.removePendingEndpoint(n)
+	// Start the protocol goroutine.
+	wq := &waiter.Queue{}
+	n.startAcceptedLoop(wq)
+	e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
+
 	e.deliverAccepted(n)
 }
 
@@ -543,6 +549,11 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// number of goroutines as we do check before
 		// entering here that there was at least some
 		// space available in the backlog.
+
+		// Start the protocol goroutine.
+		wq := &waiter.Queue{}
+		n.startAcceptedLoop(wq)
+		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 		go e.deliverAccepted(n)
 	}
 }
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index ca982c451..a114c06c1 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -15,6 +15,7 @@
 package tcp
 
 import (
+	"encoding/binary"
 	"sync"
 	"time"
 
@@ -22,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -139,7 +141,32 @@ func (h *handshake) resetState() {
 	h.flags = header.TCPFlagSyn
 	h.ackNum = 0
 	h.mss = 0
-	h.iss = seqnum.Value(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24)
+	h.iss = generateSecureISN(h.ep.ID, h.ep.stack.Seed())
+}
+
+// generateSecureISN generates a secure Initial Sequence number based on the
+// recommendation here https://tools.ietf.org/html/rfc6528#page-3.
+func generateSecureISN(id stack.TransportEndpointID, seed uint32) seqnum.Value {
+	isnHasher := jenkins.Sum32(seed)
+	isnHasher.Write([]byte(id.LocalAddress))
+	isnHasher.Write([]byte(id.RemoteAddress))
+	portBuf := make([]byte, 2)
+	binary.LittleEndian.PutUint16(portBuf, id.LocalPort)
+	isnHasher.Write(portBuf)
+	binary.LittleEndian.PutUint16(portBuf, id.RemotePort)
+	isnHasher.Write(portBuf)
+	// The time period here is 64ns. This is similar to what linux uses
+	// to generate a sequence number that overlaps less than one
+	// time per MSL (2 minutes).
+	//
+	// A 64ns clock ticks (10^9/64 = 15625000) times in a second.
+	// To wrap the whole 32 bit space would require
+	// 2^32/15625000 ~ 274 seconds.
+	//
+	// Which sort of guarantees that we won't reuse the ISN for a new
+	// connection for the same tuple for at least 274s.
+	isn := isnHasher.Sum32() + uint32(time.Now().UnixNano()>>6)
+	return seqnum.Value(isn)
 }
 
 // effectiveRcvWndScale returns the effective receive window scale to be used.
@@ -809,7 +836,19 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 	e.state = StateError
 	e.HardError = err
 	if err != tcpip.ErrConnectionReset {
-		e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, e.snd.sndUna, e.rcv.rcvNxt, 0)
+		// The exact sequence number to be used for the RST is the same as the
+		// one used by Linux. We need to handle the case of window being shrunk
+		// which can cause sndNxt to be outside the acceptable window on the
+		// receiver.
+		//
+		// See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more
+		// information.
+		sndWndEnd := e.snd.sndUna.Add(e.snd.sndWnd)
+		resetSeqNum := sndWndEnd
+		if !sndWndEnd.LessThan(e.snd.sndNxt) || e.snd.sndNxt.Size(sndWndEnd) < (1<<e.snd.sndWndScale) {
+			resetSeqNum = e.snd.sndNxt
+		}
+		e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.rcvNxt, 0)
 	}
 }
 
@@ -823,6 +862,51 @@ func (e *endpoint) completeWorkerLocked() {
 	}
 }
 
+func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
+	if e.rcv.acceptable(s.sequenceNumber, 0) {
+		// RFC 793, page 37 states that "in all states
+		// except SYN-SENT, all reset (RST) segments are
+		// validated by checking their SEQ-fields." So
+		// we only process it if it's acceptable.
+		s.decRef()
+		e.mu.Lock()
+		switch e.state {
+		// In case of a RST in CLOSE-WAIT linux moves
+		// the socket to closed state with an error set
+		// to indicate EPIPE.
+		//
+		// Technically this seems to be at odds w/ RFC.
+		// As per https://tools.ietf.org/html/rfc793#section-2.7
+		// page 69 the behavior for a segment arriving
+		// w/ RST bit set in CLOSE-WAIT is inlined below.
+		//
+		//  ESTABLISHED
+		//  FIN-WAIT-1
+		//  FIN-WAIT-2
+		//  CLOSE-WAIT
+
+		//  If the RST bit is set then, any outstanding RECEIVEs and
+		//  SEND should receive "reset" responses. All segment queues
+		//  should be flushed.  Users should also receive an unsolicited
+		//  general "connection reset" signal. Enter the CLOSED state,
+		//  delete the TCB, and return.
+		case StateCloseWait:
+			e.state = StateClose
+			e.HardError = tcpip.ErrAborted
+			// We need to set this explicitly here because otherwise
+			// the port registrations will not be released till the
+			// endpoint is actively closed by the application.
+			e.workerCleanup = true
+			e.mu.Unlock()
+			return false, nil
+		default:
+			e.mu.Unlock()
+			return false, tcpip.ErrConnectionReset
+		}
+	}
+	return true, nil
+}
+
 // handleSegments pulls segments from the queue and processes them. It returns
 // no error if the protocol loop should continue, an error otherwise.
 func (e *endpoint) handleSegments() *tcpip.Error {
@@ -840,14 +924,34 @@ func (e *endpoint) handleSegments() *tcpip.Error {
 		}
 
 		if s.flagIsSet(header.TCPFlagRst) {
-			if e.rcv.acceptable(s.sequenceNumber, 0) {
-				// RFC 793, page 37 states that "in all states
-				// except SYN-SENT, all reset (RST) segments are
-				// validated by checking their SEQ-fields." So
-				// we only process it if it's acceptable.
-				s.decRef()
-				return tcpip.ErrConnectionReset
+			if ok, err := e.handleReset(s); !ok {
+				return err
 			}
+		} else if s.flagIsSet(header.TCPFlagSyn) {
+			// See: https://tools.ietf.org/html/rfc5961#section-4.1
+			//   1) If the SYN bit is set, irrespective of the sequence number, TCP
+			//    MUST send an ACK (also referred to as challenge ACK) to the remote
+			//    peer:
+			//
+			//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+			//
+			//    After sending the acknowledgment, TCP MUST drop the unacceptable
+			//    segment and stop processing further.
+			//
+			// By sending an ACK, the remote peer is challenged to confirm the loss
+			// of the previous connection and the request to start a new connection.
+			// A legitimate peer, after restart, would not have a TCB in the
+			// synchronized state.  Thus, when the ACK arrives, the peer should send
+			// a RST segment back with the sequence number derived from the ACK
+			// field that caused the RST.
+
+			// This RST will confirm that the remote peer has indeed closed the
+			// previous connection.  Upon receipt of a valid RST, the local TCP
+			// endpoint MUST terminate its connection.  The local TCP endpoint
+			// should then rely on SYN retransmission from the remote end to
+			// re-establish the connection.
+
+			e.snd.sendAck()
 		} else if s.flagIsSet(header.TCPFlagAck) {
 			// Patch the window size in the segment according to the
 			// send window scale.
@@ -856,7 +960,15 @@ func (e *endpoint) handleSegments() *tcpip.Error {
 			// RFC 793, page 41 states that "once in the ESTABLISHED
 			// state all segments must carry current acknowledgment
 			// information."
-			e.rcv.handleRcvdSegment(s)
+			drop, err := e.rcv.handleRcvdSegment(s)
+			if err != nil {
+				s.decRef()
+				return err
+			}
+			if drop {
+				s.decRef()
+				continue
+			}
 			e.snd.handleRcvdSegment(s)
 		}
 		s.decRef()
@@ -955,7 +1067,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		}
 
 		e.mu.Unlock()
-
 		// When the protocol loop exits we should wake up our waiters.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 	}
@@ -1001,6 +1112,10 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		// RTT itself.
 		e.rcvAutoParams.prevCopied = initialRcvWnd
 		e.rcvListMu.Unlock()
+		e.stack.Stats().TCP.CurrentEstablished.Increment()
+		e.mu.Lock()
+		e.state = StateEstablished
+		e.mu.Unlock()
 	}
 
 	e.keepalive.timer.init(&e.keepalive.waker)
@@ -1008,10 +1123,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 
 	// Tell waiters that the endpoint is connected and writable.
 	e.mu.Lock()
-	if e.state != StateEstablished {
-		e.stack.Stats().TCP.CurrentEstablished.Increment()
-		e.state = StateEstablished
-	}
 	drained := e.drainDone != nil
 	e.mu.Unlock()
 	if drained {
@@ -1042,7 +1153,13 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		{
 			w: &closeWaker,
 			f: func() *tcpip.Error {
-				return tcpip.ErrConnectionAborted
+				// This means the socket is being closed because
+				// the TCP_FIN_WAIT2 timeout was hit. Just
+				// mark the socket as closed.
+				e.mu.Lock()
+				e.state = StateClose
+				e.mu.Unlock()
+				return nil
 			},
 		},
 		{
@@ -1085,17 +1202,18 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
 					e.mu.Unlock()
 				}
+
 				if n&notifyClose != 0 && closeTimer == nil {
-					// Reset the connection 3 seconds after
-					// the endpoint has been closed.
-					//
-					// The timer could fire in background
-					// when the endpoint is drained. That's
-					// OK as the loop here will not honor
-					// the firing until the undrain arrives.
-					closeTimer = time.AfterFunc(3*time.Second, func() {
-						closeWaker.Assert()
-					})
+					e.mu.Lock()
+					if e.state == StateFinWait2 && e.closed {
+						// The socket has been closed and we are in FIN_WAIT2
+						// so start the FIN_WAIT2 timer.
+						closeTimer = time.AfterFunc(e.tcpLingerTimeout, func() {
+							closeWaker.Assert()
+						})
+						e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+					}
+					e.mu.Unlock()
 				}
 
 				if n&notifyKeepaliveChanged != 0 {
@@ -1117,6 +1235,12 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 					}
 				}
 
+				if n&notifyTickleWorker != 0 {
+					// Just a tickle notification. No need to do
+					// anything.
+					return nil
+				}
+
 				return nil
 			},
 		},
@@ -1143,15 +1267,16 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	}
 	e.rcvListMu.Unlock()
 
-	e.mu.RLock()
+	e.mu.Lock()
 	if e.workerCleanup {
 		e.notifyProtocolGoroutine(notifyClose)
 	}
-	e.mu.RUnlock()
 
 	// Main loop. Handle segments until both send and receive ends of the
 	// connection have completed.
-	for !e.rcv.closed || !e.snd.closed || e.snd.sndUna != e.snd.sndNxtList {
+
+	for e.state != StateTimeWait && e.state != StateClose && e.state != StateError {
+		e.mu.Unlock()
 		e.workMu.Unlock()
 		v, _ := s.Fetch(true)
 		e.workMu.Lock()
@@ -1167,6 +1292,23 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 
 			return nil
 		}
+		e.mu.Lock()
+	}
+
+	state := e.state
+	e.mu.Unlock()
+	var reuseTW func()
+	if state == StateTimeWait {
+		// Disable close timer as we are now entering real TIME_WAIT.
+		if closeTimer != nil {
+			closeTimer.Stop()
+		}
+		// Mark the current sleeper done so as to free all associated
+		// wakers.
+		s.Done()
+		// Wake up any waiters before we enter TIME_WAIT.
+		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+		reuseTW = e.doTimeWait()
 	}
 
 	// Mark endpoint as closed.
@@ -1176,8 +1318,130 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		e.stack.Stats().TCP.CurrentEstablished.Decrement()
 		e.state = StateClose
 	}
+
 	// Lock released below.
 	epilogue()
 
+	// A new SYN was received during TIME_WAIT and we need to abort
+	// the TIME_WAIT and redirect the segment to the listener queue.
+	if reuseTW != nil {
+		reuseTW()
+	}
+
 	return nil
 }
+
+// handleTimeWaitSegments processes at most maxSegmentsPerWake segments queued for an endpoint in TIME_WAIT state. extendTimeWait
+// is true if any processed segment asked for the timer to be restarted; a non-nil reuseTW re-queues a new SYN on a listening endpoint.
+func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) {
+	checkRequeue := true
+	for i := 0; i < maxSegmentsPerWake; i++ {
+		s := e.segmentQueue.dequeue()
+		if s == nil {
+			checkRequeue = false
+			break
+		}
+		extTW, newSyn := e.rcv.handleTimeWaitSegment(s)
+		if newSyn {
+			info := e.EndpointInfo.TransportEndpointInfo
+			newID := info.ID
+			newID.RemoteAddress = ""
+			newID.RemotePort = 0
+			netProtos := []tcpip.NetworkProtocolNumber{info.NetProto}
+			// If the local address is an IPv4 address then also
+			// look for IPv6 dual stack endpoints that might be
+			// listening on the local address.
+			if newID.LocalAddress.To4() != "" {
+				netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber}
+			}
+			for _, netProto := range netProtos {
+				if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, &s.route); listenEP != nil {
+					tcpEP := listenEP.(*endpoint)
+					if EndpointState(tcpEP.State()) == StateListen {
+						reuseTW = func() {
+							tcpEP.enqueueSegment(s)
+						}
+						// We explicitly do not decRef
+						// the segment: the reuseTW closure
+						// still references it and will enqueue
+						// it on the listening endpoint.
+						return false, reuseTW
+					}
+				}
+			}
+		}
+		if extTW {
+			extendTimeWait = true
+		}
+		s.decRef()
+	}
+	if checkRequeue && !e.segmentQueue.empty() {
+		e.newSegmentWaker.Assert()
+	}
+	return extendTimeWait, nil
+}
+
+// doTimeWait is responsible for handling the TCP behaviour once a socket
+// enters the TIME_WAIT state. Optionally it can return a closure that
+// should be executed after releasing the endpoint registrations. This is
+// done in cases where a new SYN is received during TIME_WAIT that carries
+// a sequence number larger than any seen on the connection.
+func (e *endpoint) doTimeWait() (twReuse func()) {
+	// Trigger a 2 * MSL time wait state. During this period incoming
+	// segments are only examined by handleTimeWaitSegments.
+	// NOTE: On Linux this is not configurable and is fixed at 60 seconds.
+	timeWaitDuration := DefaultTCPTimeWaitTimeout
+
+	// Prefer the stack wide configuration when it can be read.
+	var tcpTW tcpip.TCPTimeWaitTimeoutOption
+	if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil {
+		timeWaitDuration = time.Duration(tcpTW)
+	}
+
+	const newSegment = 1
+	const notification = 2
+	const timeWaitDone = 3
+
+	s := sleep.Sleeper{}
+	s.AddWaker(&e.newSegmentWaker, newSegment)
+	s.AddWaker(&e.notificationWaker, notification)
+
+	var timeWaitWaker sleep.Waker
+	s.AddWaker(&timeWaitWaker, timeWaitDone)
+	timeWaitTimer := time.AfterFunc(timeWaitDuration, timeWaitWaker.Assert)
+	defer timeWaitTimer.Stop()
+
+	for {
+		e.workMu.Unlock()
+		v, _ := s.Fetch(true)
+		e.workMu.Lock()
+		switch v {
+		case newSegment:
+			extendTimeWait, reuseTW := e.handleTimeWaitSegments()
+			if reuseTW != nil {
+				return reuseTW
+			}
+			if extendTimeWait {
+				timeWaitTimer.Reset(timeWaitDuration)
+			}
+		case notification:
+			n := e.fetchNotifications()
+			if n&notifyClose != 0 {
+				return nil
+			}
+			if n&notifyDrain != 0 {
+				for !e.segmentQueue.empty() {
+					// Ignore extending TIME_WAIT (and any
+					// reuseTW closure) during a save. For sockets
+					// in TIME_WAIT we just terminate it early.
+					e.handleTimeWaitSegments()
+				}
+				close(e.drainDone)
+				<-e.undrain
+				return nil
+			}
+		case timeWaitDone:
+			return nil
+		}
+	}
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 79fec6b77..04c92c04c 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -121,6 +121,11 @@ const (
 	notifyReset
 	notifyKeepaliveChanged
 	notifyMSSChanged
+	// notifyTickleWorker is used to tickle the protocol main loop during a
+	// restore after we update the endpoint state to the correct one. This
+	// ensures the loop terminates if the final state of the endpoint is
+	// say TIME_WAIT.
+	notifyTickleWorker
 )
 
 // SACKInfo holds TCP SACK related information for a given endpoint.
@@ -320,6 +325,11 @@ type endpoint struct {
 
 	state EndpointState `state:".(EndpointState)"`
 
+	// origEndpointState is only used during a restore phase to save the
+	// endpoint state at restore time as the socket is moved to its correct
+	// state.
+	origEndpointState EndpointState `state:"nosave"`
+
 	isPortReserved    bool `state:"manual"`
 	isRegistered      bool
 	boundNICID        tcpip.NICID `state:"manual"`
@@ -503,6 +513,16 @@ type endpoint struct {
 
 	// TODO(b/142022063): Add ability to save and restore per endpoint stats.
 	stats Stats `state:"nosave"`
+
+	// tcpLingerTimeout is the maximum amount of time a socket
+	// stays in FIN_WAIT2 state after being closed before it is
+	// marked closed.
+	tcpLingerTimeout time.Duration
+
+	// closed indicates that the user has called Close on the
+	// endpoint and at this point the endpoint is only around
+	// to complete the TCP shutdown.
+	closed bool
 }
 
 // UniqueID implements stack.TransportEndpoint.UniqueID.
@@ -599,6 +619,11 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		e.SetSockOptInt(tcpip.DelayOption, 1)
 	}
 
+	var tcpLT tcpip.TCPLingerTimeoutOption
+	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
+		e.tcpLingerTimeout = time.Duration(tcpLT)
+	}
+
 	if p := s.GetTCPProbe(); p != nil {
 		e.probe = p
 	}
@@ -686,6 +711,13 @@ func (e *endpoint) notifyProtocolGoroutine(n uint32) {
 // with it. It must be called only once and with no other concurrent calls to
 // the endpoint.
 func (e *endpoint) Close() {
+	e.mu.Lock()
+	closed := e.closed
+	e.mu.Unlock()
+	if closed {
+		return
+	}
+
 	// Issue a shutdown so that the peer knows we won't send any more data
 	// if we're connected, or stop accepting if we're listening.
 	e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
@@ -706,6 +738,8 @@ func (e *endpoint) Close() {
 		e.isPortReserved = false
 	}
 
+	// Mark endpoint as closed.
+	e.closed = true
 	// Either perform the local cleanup or kick the worker to make sure it
 	// knows it needs to cleanup.
 	tcpip.AddDanglingEndpoint(e)
@@ -731,9 +765,7 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
 	go func() {
 		defer close(done)
 		for n := range e.acceptedChan {
-			n.mu.Lock()
-			n.resetConnectionLocked(tcpip.ErrConnectionAborted)
-			n.mu.Unlock()
+			n.notifyProtocolGoroutine(notifyReset)
 			n.Close()
 		}
 	}()
@@ -1349,6 +1381,28 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case tcpip.TCPLingerTimeoutOption:
+		e.mu.Lock()
+		if v < 0 {
+			// Same as effectively disabling TCPLinger timeout.
+			v = 0
+		}
+		var stkTCPLingerTimeout tcpip.TCPLingerTimeoutOption
+		if err := e.stack.TransportProtocolOption(header.TCPProtocolNumber, &stkTCPLingerTimeout); err != nil {
+			// We were unable to retrieve a stack config, just use
+			// the DefaultTCPLingerTimeout.
+			if v > tcpip.TCPLingerTimeoutOption(DefaultTCPLingerTimeout) {
+				stkTCPLingerTimeout = tcpip.TCPLingerTimeoutOption(DefaultTCPLingerTimeout)
+			}
+		}
+		// Cap it to the stack wide TCPLinger timeout.
+		if v > stkTCPLingerTimeout {
+			v = stkTCPLingerTimeout
+		}
+		e.tcpLingerTimeout = time.Duration(v)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return nil
 	}
@@ -1562,6 +1616,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.RUnlock()
 		return nil
 
+	case *tcpip.TCPLingerTimeoutOption:
+		e.mu.Lock()
+		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -1696,7 +1756,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 		// src IP to ensure that for a given tuple (srcIP, destIP,
 		// destPort) the offset used as a starting point is the same to
 		// ensure that we can cycle through the port space effectively.
-		h := jenkins.Sum32(e.stack.PortSeed())
+		h := jenkins.Sum32(e.stack.Seed())
 		h.Write([]byte(e.ID.LocalAddress))
 		h.Write([]byte(e.ID.RemoteAddress))
 		portBuf := make([]byte, 2)
@@ -1782,9 +1842,8 @@ func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error {
 // peer.
 func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 	e.mu.Lock()
-	defer e.mu.Unlock()
 	e.shutdownFlags |= flags
-
+	finQueued := false
 	switch {
 	case e.state.connected():
 		// Close for read.
@@ -1799,6 +1858,7 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 			// the connection with a RST.
 			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
 				e.notifyProtocolGoroutine(notifyReset)
+				e.mu.Unlock()
 				return nil
 			}
 		}
@@ -1817,14 +1877,11 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 			s := newSegmentFromView(&e.route, e.ID, nil)
 			e.sndQueue.PushBack(s)
 			e.sndBufInQueue++
-
+			finQueued = true
 			// Mark endpoint as closed.
 			e.sndClosed = true
 
 			e.sndBufMu.Unlock()
-
-			// Tell protocol goroutine to close.
-			e.sndCloseWaker.Assert()
 		}
 
 	case e.state == StateListen:
@@ -1832,11 +1889,20 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 		if flags&tcpip.ShutdownRead != 0 {
 			e.notifyProtocolGoroutine(notifyClose)
 		}
-
 	default:
+		e.mu.Unlock()
 		return tcpip.ErrNotConnected
 	}
-
+	e.mu.Unlock()
+	if finQueued {
+		if e.workMu.TryLock() {
+			e.handleClose()
+			e.workMu.Unlock()
+		} else {
+			// Tell protocol goroutine to close.
+			e.sndCloseWaker.Assert()
+		}
+	}
 	return nil
 }
 
@@ -1928,12 +1994,7 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 		return nil, nil, tcpip.ErrWouldBlock
 	}
 
-	// Start the protocol goroutine.
-	wq := &waiter.Queue{}
-	n.startAcceptedLoop(wq)
-	e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
-
-	return n, wq, nil
+	return n, n.waiterQueue, nil
 }
 
 // Bind binds the endpoint to a specific local port and optionally address.
@@ -2058,6 +2119,10 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		e.stack.Stats().TCP.ResetsReceived.Increment()
 	}
 
+	e.enqueueSegment(s)
+}
+
+func (e *endpoint) enqueueSegment(s *segment) {
 	// Send packet to worker goroutine.
 	if e.segmentQueue.enqueue(s) {
 		e.newSegmentWaker.Assert()
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 19f003b6b..7aa4c3f0e 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -78,7 +78,7 @@ func (e *endpoint) beforeSave() {
 		}
 		fallthrough
 	case StateError, StateClose:
-		for e.state == StateError && e.workerRunning {
+		for (e.state == StateError || e.state == StateClose) && e.workerRunning {
 			e.mu.Unlock()
 			time.Sleep(100 * time.Millisecond)
 			e.mu.Lock()
@@ -165,6 +165,12 @@ func (e *endpoint) loadState(state EndpointState) {
 
 // afterLoad is invoked by stateify.
 func (e *endpoint) afterLoad() {
+	// Freeze segment queue before registering to prevent any segments
+	// from being delivered while it is being restored.
+	e.origEndpointState = e.state
+	// Restore the endpoint to InitialState as it will be moved to
+	// its origEndpointState during Resume.
+	e.state = StateInitial
 	stack.StackFromEnv.RegisterRestoredEndpoint(e)
 }
 
@@ -173,8 +179,8 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	e.stack = s
 	e.segmentQueue.setLimit(MaxUnprocessedSegments)
 	e.workMu.Init()
+	state := e.origEndpointState
 
-	state := e.state
 	switch state {
 	case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
 		var ss SendBufferSizeOption
@@ -189,7 +195,6 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	}
 
 	bind := func() {
-		e.state = StateInitial
 		if len(e.BindAddr) == 0 {
 			e.BindAddr = e.ID.LocalAddress
 		}
@@ -219,6 +224,16 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		if err := e.connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.ID.RemotePort}, false, e.workerRunning); err != tcpip.ErrConnectStarted {
 			panic("endpoint connecting failed: " + err.String())
 		}
+		e.mu.Lock()
+		e.state = e.origEndpointState
+		closed := e.closed
+		e.mu.Unlock()
+		e.notifyProtocolGoroutine(notifyTickleWorker)
+		if state == StateFinWait2 && closed {
+			// If the endpoint has been closed then make sure we notify so
+			// that the FIN_WAIT2 timer is started after a restore.
+			e.notifyProtocolGoroutine(notifyClose)
+		}
 		connectedLoading.Done()
 	case StateListen:
 		tcpip.AsyncLoading.Add(1)
@@ -265,8 +280,11 @@ func (e *endpoint) Resume(s *stack.Stack) {
 				tcpip.AsyncLoading.Done()
 			}()
 		}
-		fallthrough
+		e.state = StateClose
+		e.stack.CompleteTransportEndpointCleanup(e)
+		tcpip.DeleteDanglingEndpoint(e)
 	case StateError:
+		e.state = StateError
 		e.stack.CompleteTransportEndpointCleanup(e)
 		tcpip.DeleteDanglingEndpoint(e)
 	}
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index c8e4a0d7e..89b965c23 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -23,6 +23,7 @@ package tcp
 import (
 	"strings"
 	"sync"
+	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -54,6 +55,14 @@ const (
 	// MaxUnprocessedSegments is the maximum number of unprocessed segments
 	// that can be queued for a given endpoint.
 	MaxUnprocessedSegments = 300
+
+	// DefaultTCPLingerTimeout is the amount of time that sockets linger in
+	// FIN_WAIT_2 state before being marked closed.
+	DefaultTCPLingerTimeout = 60 * time.Second
+
+	// DefaultTCPTimeWaitTimeout is the amount of time that sockets linger
+	// in TIME_WAIT state before being marked closed.
+	DefaultTCPTimeWaitTimeout = 60 * time.Second
 )
 
 // SACKEnabled option can be used to enable SACK support in the TCP
@@ -93,6 +102,8 @@ type protocol struct {
 	congestionControl          string
 	availableCongestionControl []string
 	moderateReceiveBuffer      bool
+	tcpLingerTimeout           time.Duration
+	tcpTimeWaitTimeout         time.Duration
 }
 
 // Number returns the tcp protocol number.
@@ -212,6 +223,24 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
 		p.mu.Unlock()
 		return nil
 
+	case tcpip.TCPLingerTimeoutOption:
+		if v < 0 {
+			v = 0
+		}
+		p.mu.Lock()
+		p.tcpLingerTimeout = time.Duration(v)
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.TCPTimeWaitTimeoutOption:
+		if v < 0 {
+			v = 0
+		}
+		p.mu.Lock()
+		p.tcpTimeWaitTimeout = time.Duration(v)
+		p.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -262,6 +291,18 @@ func (p *protocol) Option(option interface{}) *tcpip.Error {
 		p.mu.Unlock()
 		return nil
 
+	case *tcpip.TCPLingerTimeoutOption:
+		p.mu.Lock()
+		*v = tcpip.TCPLingerTimeoutOption(p.tcpLingerTimeout)
+		p.mu.Unlock()
+		return nil
+
+	case *tcpip.TCPTimeWaitTimeoutOption:
+		p.mu.Lock()
+		*v = tcpip.TCPTimeWaitTimeoutOption(p.tcpTimeWaitTimeout)
+		p.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -274,5 +315,7 @@ func NewProtocol() stack.TransportProtocol {
 		recvBufferSize:             ReceiveBufferSizeOption{MinBufferSize, DefaultReceiveBufferSize, MaxBufferSize},
 		congestionControl:          ccReno,
 		availableCongestionControl: []string{ccReno, ccCubic},
+		tcpLingerTimeout:           DefaultTCPLingerTimeout,
+		tcpTimeWaitTimeout:         DefaultTCPTimeWaitTimeout,
 	}
 }
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index e90f9a7d9..068b90fb6 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -18,6 +18,7 @@ import (
 	"container/heap"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 )
@@ -209,6 +210,11 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		switch r.ep.state {
 		case StateFinWait1:
 			r.ep.state = StateFinWait2
+			// Notify protocol goroutine that we have received an
+			// ACK to our FIN so that it can start the FIN_WAIT2
+			// timer to abort connection if the other side does
+			// not close within 2MSL.
+			r.ep.notifyProtocolGoroutine(notifyClose)
 		case StateClosing:
 			r.ep.state = StateTimeWait
 		case StateLastAck:
@@ -253,23 +259,105 @@ func (r *receiver) updateRTT() {
 	r.ep.rcvListMu.Unlock()
 }
 
-// handleRcvdSegment handles TCP segments directed at the connection managed by
-// r as they arrive. It is called by the protocol main loop.
-func (r *receiver) handleRcvdSegment(s *segment) {
+func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, closed bool) (drop bool, err *tcpip.Error) {
+	r.ep.rcvListMu.Lock()
+	rcvClosed := r.ep.rcvClosed || r.closed
+	r.ep.rcvListMu.Unlock()
+
+	// handleRcvdSegment calls this helper only when the endpoint is not in StateEstablished. If we are in one of the
+	// shutdown states then we need to do additional checks before we try and process the segment.
+	switch state {
+	case StateCloseWait, StateClosing, StateLastAck:
+		if !s.sequenceNumber.LessThanEq(r.rcvNxt) {
+			s.decRef()
+			// Just drop the segment as we have
+			// already received a FIN and this
+			// segment is after the sequence number
+			// for the FIN.
+			return true, nil
+		}
+		fallthrough
+	case StateFinWait1:
+		fallthrough
+	case StateFinWait2:
+		// If we are closed for reads (either due to an
+		// incoming FIN or the user calling shutdown(..,
+		// SHUT_RD)) then any data past the rcvNxt should
+		// trigger a RST.
+		endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
+		if rcvClosed && r.rcvNxt.LessThan(endDataSeq) {
+			s.decRef()
+			return true, tcpip.ErrConnectionAborted
+		}
+		if state == StateFinWait1 {
+			break
+		}
+
+		// If it's a retransmission of an old data segment
+		// or a pure ACK then allow it.
+		if s.sequenceNumber.Add(s.logicalLen()).LessThanEq(r.rcvNxt) ||
+			s.logicalLen() == 0 {
+			break
+		}
+
+		// In FIN-WAIT2 if the socket is fully
+		// closed (not owned by the application on our
+		// end) then the only acceptable segment is a
+		// FIN. Since FIN can technically also carry
+		// data we verify that the segment carrying a
+		// FIN ends at exactly r.rcvNxt+1.
+		//
+		// From RFC793 page 25.
+		//
+		// For sequence number purposes, the SYN is
+		// considered to occur before the first actual
+		// data octet of the segment in which it occurs,
+		// while the FIN is considered to occur after
+		// the last actual data octet in a segment in
+		// which it occurs.
+		if closed && (!s.flagIsSet(header.TCPFlagFin) || s.sequenceNumber.Add(s.logicalLen()) != r.rcvNxt+1) {
+			s.decRef()
+			return true, tcpip.ErrConnectionAborted
+		}
+	}
+
 	// We don't care about receive processing anymore if the receive side
 	// is closed.
-	if r.closed {
-		return
+	//
+	// NOTE: We still want to permit a FIN as it's possible only our
+	// end has closed and the peer is yet to send a FIN. Hence we
+	// compare only the payload.
+	segEnd := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
+	if rcvClosed && !segEnd.LessThanEq(r.rcvNxt) {
+		return true, nil
+	}
+	return false, nil
+}
+
+// handleRcvdSegment handles TCP segments directed at the connection managed by
+// r as they arrive. It is called by the protocol main loop.
+func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
+	r.ep.mu.RLock()
+	state := r.ep.state
+	closed := r.ep.closed
+	r.ep.mu.RUnlock()
+
+	if state != StateEstablished {
+		drop, err := r.handleRcvdSegmentClosing(s, state, closed)
+		if drop || err != nil {
+			return drop, err
+		}
 	}
 
 	segLen := seqnum.Size(s.data.Size())
 	segSeq := s.sequenceNumber
 
 	// If the sequence number range is outside the acceptable range, just
-	// send an ACK. This is according to RFC 793, page 37.
+	// send an ACK and stop further processing of the segment.
+	// This is according to RFC 793, page 68.
 	if !r.acceptable(segSeq, segLen) {
 		r.ep.snd.sendAck()
-		return
+		return true, nil
 	}
 
 	// Defer segment processing if it can't be consumed now.
@@ -288,7 +376,7 @@ func (r *receiver) handleRcvdSegment(s *segment) {
 			// have to retransmit.
 			r.ep.snd.sendAck()
 		}
-		return
+		return false, nil
 	}
 
 	// Since we consumed a segment update the receiver's RTT estimate
@@ -315,4 +403,67 @@ func (r *receiver) handleRcvdSegment(s *segment) {
 		r.pendingBufUsed -= s.logicalLen()
 		s.decRef()
 	}
+	return false, nil
+}
+
+// handleTimeWaitSegment handles inbound segments received when the endpoint has entered the TIME_WAIT state. resetTimeWait is
+// true for a retransmitted FIN-ACK (the timer should restart); newSyn is true for a SYN whose sequence number is past rcvNxt.
+func (r *receiver) handleTimeWaitSegment(s *segment) (resetTimeWait bool, newSyn bool) {
+	segSeq := s.sequenceNumber
+	segLen := seqnum.Size(s.data.Size())
+
+	// Just silently drop any RST packets in TIME_WAIT. We do not support
+	// TIME_WAIT assassination; as a result we conform to fix 1 as described
+	// in https://tools.ietf.org/html/rfc1337#section-3.
+	if s.flagIsSet(header.TCPFlagRst) {
+		return false, false
+	}
+
+	// If it's a SYN and the sequence number is higher than any seen before
+	// for this connection then try and redirect it to a listening endpoint
+	// if available.
+	//
+	// RFC 1122:
+	//   "When a connection is [...] on TIME-WAIT state [...]
+	//   [a TCP] MAY accept a new SYN from the remote TCP to
+	//   reopen the connection directly, if it:
+
+	//    (1) assigns its initial sequence number for the new
+	//     connection to be larger than the largest sequence
+	//     number it used on the previous connection incarnation,
+	//     and
+
+	//    (2) returns to TIME-WAIT state if the SYN turns out
+	//      to be an old duplicate".
+	if s.flagIsSet(header.TCPFlagSyn) && r.rcvNxt.LessThan(segSeq) {
+
+		return false, true
+	}
+
+	// Drop the segment if it does not contain an ACK.
+	if !s.flagIsSet(header.TCPFlagAck) {
+		return false, false
+	}
+
+	// Update Timestamp if required. See RFC7323, section-4.3.
+	if r.ep.sendTSOk && s.parsedOptions.TS {
+		r.ep.updateRecentTimestamp(s.parsedOptions.TSVal, r.ep.snd.maxSentAck, segSeq)
+	}
+
+	if segSeq.Add(1) == r.rcvNxt && s.flagIsSet(header.TCPFlagFin) {
+		// If it's a FIN-ACK then resetTimeWait and send an ACK, as it
+		// indicates our final ACK could have been lost.
+		r.ep.snd.sendAck()
+		return true, false
+	}
+
+	// If the sequence number range is outside the acceptable range or
+	// carries data then just send an ACK. This is according to RFC 793,
+	// page 37.
+	//
+	// NOTE: In TIME_WAIT the only acceptable sequence number is rcvNxt.
+	if segSeq != r.rcvNxt || segLen != 0 {
+		r.ep.snd.sendAck()
+	}
+	return false, false
 }
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index f4ea5f091..0c1704d74 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -206,17 +206,18 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
+	// Set TCPLingerTimeout to 5 seconds so that sockets are marked closed
 	wq := &waiter.Queue{}
 	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
 	if err != nil {
-		t.Fatalf("NewEndpoint failed: %v", err)
+		t.Fatalf("NewEndpoint failed: %s", err)
 	}
 	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
-		t.Fatalf("Bind failed: %v", err)
+		t.Fatalf("Bind failed: %s", err)
 	}
 
 	if err := ep.Listen(10); err != nil {
-		t.Fatalf("Listen failed: %v", err)
+		t.Fatalf("Listen failed: %s", err)
 	}
 
 	// Send a SYN request.
@@ -256,7 +257,7 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 		case <-ch:
 			c.EP, _, err = ep.Accept()
 			if err != nil {
-				t.Fatalf("Accept failed: %v", err)
+				t.Fatalf("Accept failed: %s", err)
 			}
 
 		case <-time.After(1 * time.Second):
@@ -264,6 +265,13 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 		}
 	}
 
+	// Lower stackwide TIME_WAIT timeout so that the reservations
+	// are released instantly on Close.
+	tcpTW := tcpip.TCPTimeWaitTimeoutOption(1 * time.Millisecond)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpTW); err != nil {
+		t.Fatalf("e.stack.SetTransportProtocolOption(%d, %s) = %s", tcp.ProtocolNumber, tcpTW, err)
+	}
+
 	c.EP.Close()
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.SrcPort(context.StackPort),
@@ -285,6 +293,11 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 	// Get the ACK to the FIN we just sent.
 	c.GetPacket()
 
+	// Since an active close was done we need to wait for a little more than
+	// the TIME_WAIT timeout for the port reservations to be released and the
+	// socket to move to a CLOSED state.
+	time.Sleep(20 * time.Millisecond)
+
 	// Now resend the same ACK, this ACK should generate a RST as there
 	// should be no endpoint in SYN-RCVD state and we are not using
 	// syn-cookies yet. The reason we send the same ACK is we need a valid
@@ -376,6 +389,13 @@ func TestConnectResetAfterClose(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
+	// Set TCPLinger to 3 seconds so that sockets are marked closed
+	// after 3 seconds in FIN_WAIT2 state.
+	tcpLingerTimeout := 3 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPLingerTimeoutOption(tcpLingerTimeout)); err != nil {
+		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpLingerTimeout, err)
+	}
+
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 	ep := c.EP
 	c.EP = nil
@@ -396,12 +416,24 @@ func TestConnectResetAfterClose(t *testing.T) {
 		DstPort: c.Port,
 		Flags:   header.TCPFlagAck,
 		SeqNum:  790,
-		AckNum:  c.IRS.Add(1),
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// Wait for the ep to give up waiting for a FIN.
+	time.Sleep(tcpLingerTimeout + 1*time.Second)
+
+	// Now send an ACK and it should trigger a RST as the endpoint should
+	// not exist anymore.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(2),
 		RcvWnd:  30000,
 	})
 
-	// Wait for the ep to give up waiting for a FIN, and send a RST.
-	time.Sleep(3 * time.Second)
 	for {
 		b := c.GetPacket()
 		tcpHdr := header.TCP(header.IPv4(b).Payload())
@@ -413,7 +445,7 @@ func TestConnectResetAfterClose(t *testing.T) {
 		checker.IPv4(t, b,
 			checker.TCP(
 				checker.DstPort(context.TestPort),
-				checker.SeqNum(uint32(c.IRS)+1),
+				checker.SeqNum(uint32(c.IRS)+2),
 				checker.AckNum(790),
 				checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
 			),
@@ -1110,8 +1142,7 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
 		checker.TCP(
 			checker.DstPort(context.TestPort),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
-			// We shouldn't consume a sequence number on RST.
-			checker.SeqNum(uint32(c.IRS)+1),
+			checker.SeqNum(uint32(c.IRS)+2),
 		))
 	// The RST puts the endpoint into an error state.
 	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
@@ -3085,6 +3116,13 @@ func TestReadAfterClosedState(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
+	// Set TCPTimeWaitTimeout to 1 second so that sockets are marked closed
+	// after 1 second in TIME_WAIT state.
+	tcpTimeWaitTimeout := 1 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
+		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeout(%d) failed: %s", tcpTimeWaitTimeout, err)
+	}
+
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
 	we, ch := waiter.NewChannelEntry(nil)
@@ -3092,12 +3130,12 @@ func TestReadAfterClosedState(t *testing.T) {
 	defer c.WQ.EventUnregister(&we)
 
 	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
-		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock)
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %s", err, tcpip.ErrWouldBlock)
 	}
 
 	// Shutdown immediately for write, check that we get a FIN.
 	if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil {
-		t.Fatalf("Shutdown failed: %v", err)
+		t.Fatalf("Shutdown failed: %s", err)
 	}
 
 	checker.IPv4(t, c.GetPacket(),
@@ -3135,10 +3173,9 @@ func TestReadAfterClosedState(t *testing.T) {
 		),
 	)
 
-	// Give the stack the chance to transition to closed state. Note that since
-	// both the sender and receiver are now closed, we effectively skip the
-	// TIME-WAIT state.
-	time.Sleep(1 * time.Second)
+	// Give the stack the chance to transition to closed state from
+	// TIME_WAIT.
+	time.Sleep(tcpTimeWaitTimeout * 2)
 
 	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateClose; got != want {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
@@ -3155,7 +3192,7 @@ func TestReadAfterClosedState(t *testing.T) {
 	peekBuf := make([]byte, 10)
 	n, _, err := c.EP.Peek([][]byte{peekBuf})
 	if err != nil {
-		t.Fatalf("Peek failed: %v", err)
+		t.Fatalf("Peek failed: %s", err)
 	}
 
 	peekBuf = peekBuf[:n]
@@ -3166,7 +3203,7 @@ func TestReadAfterClosedState(t *testing.T) {
 	// Receive data.
 	v, _, err := c.EP.Read(nil)
 	if err != nil {
-		t.Fatalf("Read failed: %v", err)
+		t.Fatalf("Read failed: %s", err)
 	}
 
 	if !bytes.Equal(data, v) {
@@ -3176,11 +3213,11 @@ func TestReadAfterClosedState(t *testing.T) {
 	// Now that we drained the queue, check that functions fail with the
 	// right error code.
 	if _, _, err := c.EP.Read(nil); err != tcpip.ErrClosedForReceive {
-		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrClosedForReceive)
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %s", err, tcpip.ErrClosedForReceive)
 	}
 
 	if _, _, err := c.EP.Peek([][]byte{peekBuf}); err != tcpip.ErrClosedForReceive {
-		t.Fatalf("got c.EP.Peek(...) = %v, want = %v", err, tcpip.ErrClosedForReceive)
+		t.Fatalf("got c.EP.Peek(...) = %v, want = %s", err, tcpip.ErrClosedForReceive)
 	}
 }
 
@@ -4347,7 +4384,8 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
 	// Send a SYN request.
 	irs := seqnum.Value(789)
 	c.SendPacket(nil, &context.Headers{
-		SrcPort: context.TestPort,
+		// pick a different src port for new SYN.
+		SrcPort: context.TestPort + 1,
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  irs,
@@ -4893,3 +4931,545 @@ func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.Del
 		t.Errorf("ep.GetSockOptInt(tcpip.DelayOption) got: %d, want: %d", gotDelayOption, wantDelayOption)
 	}
 }
+
+// TestTCPLingerTimeout sets TCPLingerTimeoutOption on a connected endpoint
+// and checks the value that GetSockOpt reports back for each case.
+func TestTCPLingerTimeout(t *testing.T) {
+	c := context.New(t, 1500 /* mtu */)
+	defer c.Cleanup()
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	for _, test := range []struct {
+		name             string
+		tcpLingerTimeout time.Duration
+		want             time.Duration
+	}{
+		{"NegativeLingerTimeout", -123123, 0},
+		{"ZeroLingerTimeout", 0, 0},
+		{"InRangeLingerTimeout", 10 * time.Second, 10 * time.Second},
+		// Values > stack's TCPLingerTimeout are capped to the stack's
+		// value. Defaults to tcp.DefaultTCPLingerTimeout(60 seconds)
+		{"AboveMaxLingerTimeout", 65 * time.Second, 60 * time.Second},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			opt := tcpip.TCPLingerTimeoutOption(test.tcpLingerTimeout)
+			if err := c.EP.SetSockOpt(opt); err != nil {
+				t.Fatalf("SetSockOpt(%s) = %s", test.tcpLingerTimeout, err)
+			}
+			var got tcpip.TCPLingerTimeoutOption
+			if err := c.EP.GetSockOpt(&got); err != nil {
+				t.Fatalf("GetSockOpt(tcpip.TCPLingerTimeoutOption) = %s", err)
+			}
+			if time.Duration(got) != test.want {
+				t.Fatalf("unexpected linger timeout got: %s, want: %s", time.Duration(got), test.want)
+			}
+		})
+	}
+}
+
+// TestTCPTimeWaitRSTIgnored checks that, after the endpoint has closed and
+// ACKed the peer's FIN, a RST draws no reply but an out-of-order ACK does.
+func TestTCPTimeWaitRSTIgnored(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	// Fail on any Accept error, not only on a failed retry.
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+1),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	}
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Now send a RST and this should be ignored and not
+	// generate an ACK.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagRst,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	})
+
+	c.CheckNoPacketTimeout("unexpected packet received in TIME_WAIT state", 1*time.Second)
+
+	// Out of order ACK should generate an immediate ACK in
+	// TIME_WAIT.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 3,
+	})
+
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+}
+
+// TestTCPTimeWaitOutOfOrder checks that, after the endpoint has closed and
+// ACKed the peer's FIN, an out-of-order ACK is answered with an ACK.
+func TestTCPTimeWaitOutOfOrder(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	// Fail on any Accept error, not only on a failed retry.
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+1),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	}
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Out of order ACK should generate an immediate ACK in
+	// TIME_WAIT.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 3,
+	})
+
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+}
+
+// TestTCPTimeWaitNewSyn checks that in TIME_WAIT a SYN reusing the old
+// sequence number gets no reply, while a SYN with a higher one is accepted.
+func TestTCPTimeWaitNewSyn(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	// Fail on any Accept error, not only on a failed retry.
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+1),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	}
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Send a SYN request w/ sequence number lower than
+	// the highest sequence number sent. We just reuse
+	// the same number.
+	iss = seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	c.CheckNoPacketTimeout("unexpected packet received in response to SYN", 1*time.Second)
+
+	// Send a SYN request w/ sequence number higher than
+	// the highest sequence number sent.
+	iss = seqnum.Value(792)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	// Receive the SYN-ACK reply.
+	b = c.GetPacket()
+	tcpHdr = header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders = &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	// Fail on any Accept error, not only on a failed retry.
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+}
+
+// TestTCPTimeWaitDuplicateFINExtendsTimeWait checks that a duplicate FIN in
+// TIME_WAIT is ACKed again and restarts the TIME_WAIT timer.
+func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed
+	// after 5 seconds in TIME_WAIT state.
+	tcpTimeWaitTimeout := 5 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
+		t.Fatalf("c.Stack().SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeoutOption(%s)) failed: %s", tcpTimeWaitTimeout, err)
+	}
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	// Fail on any Accept error, not only on a failed retry.
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+1),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	}
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	time.Sleep(2 * time.Second)
+
+	// Now send a duplicate FIN. This should cause the TIME_WAIT to extend
+	// by another 5 seconds and also send us a duplicate ACK as it should
+	// indicate that the final ACK was potentially lost.
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the duplicate FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Sleep for 4 seconds so at this point we are 1 second past the
+	// original tcpTimeWaitTimeout of 5 seconds.
+	time.Sleep(4 * time.Second)
+
+	// Send an ACK and it should not generate any packet as the socket
+	// should still be in TIME_WAIT: the duplicate FIN sent at the 2 second
+	// mark restarted the 5 second timer.
+	*ackHeaders = *finHeaders
+	ackHeaders.SeqNum = ackHeaders.SeqNum + 1
+	ackHeaders.Flags = header.TCPFlagAck
+	c.SendPacket(nil, ackHeaders)
+
+	c.CheckNoPacketTimeout("unexpected packet received from endpoint in TIME_WAIT", 1*time.Second)
+	// Now sleep for another 2 seconds so that we are past the
+	// extended TIME_WAIT of 7 seconds (2 + 5).
+	time.Sleep(2 * time.Second)
+
+	// Resend the same ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Receive the RST that should be generated as there is no valid
+	// endpoint.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(ackHeaders.AckNum)),
+		checker.AckNum(uint32(ackHeaders.SeqNum)),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 3e5b6b3c3..722d14b53 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -9,7 +9,7 @@ syscall_test(test = "//test/syscalls/linux:accept_bind_stream_test")
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:accept_bind_test",
 )
 
@@ -434,7 +434,7 @@ syscall_test(
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_abstract_test",
 )
 
@@ -445,7 +445,7 @@ syscall_test(
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_domain_test",
 )
 
@@ -458,19 +458,19 @@ syscall_test(
 syscall_test(
     size = "large",
     add_overlay = True,
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_filesystem_test",
 )
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_inet_loopback_test",
 )
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_tcp_generic_loopback_test",
 )
 
@@ -481,13 +481,13 @@ syscall_test(
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_tcp_loopback_test",
 )
 
 syscall_test(
     size = "medium",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_tcp_udp_generic_loopback_test",
 )
 
@@ -498,7 +498,7 @@ syscall_test(
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_udp_loopback_test",
 )
 
@@ -560,7 +560,7 @@ syscall_test(
 syscall_test(
     size = "large",
     add_overlay = True,
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_unix_pair_test",
 )
 
@@ -599,7 +599,7 @@ syscall_test(
 
 syscall_test(
     size = "large",
-    shard_count = 10,
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_unix_unbound_stream_test",
 )
 
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 93bff8299..f8b8cb724 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2141,6 +2141,7 @@ cc_library(
     deps = [
         ":socket_test_util",
         "//test/util:test_util",
+        "//test/util:thread_util",
         "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index ab375aaaf..2eeee352e 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <arpa/inet.h>
+#include <linux/tcp.h>
 #include <netinet/in.h>
 #include <poll.h>
 #include <string.h>
@@ -31,6 +32,7 @@
 #include "gtest/gtest.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
+#include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
@@ -267,6 +269,340 @@ TEST_P(SocketInetLoopbackTest, TCPbacklog) {
   }
 }
 
+// TCPFinWait2Test creates a pair of connected sockets then closes one end to
+// trigger FIN_WAIT2 state for the closed endpoint. Then it binds the same local
+// IP/port on a new socket and tries to connect. The connect should fail w/
+// an EADDRINUSE. Then we wait till the FIN_WAIT2 timeout is over and try the
+// connect again with a new socket and this time it should succeed.
+//
+// TCP timers are not S/R today, this can cause this test to be flaky when run
+// under random S/R due to timer being reset on a restore.
+TEST_P(SocketInetLoopbackTest, TCPFinWait2Test_NoRandomSave) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  // Lower the FIN_WAIT2 timeout to 5 seconds for the test.
+  constexpr int kTCPLingerTimeout = 5;
+  EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2,
+                         &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
+              SyscallSucceedsWithValue(0));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+  // Get the address/port bound by the connecting socket.
+  sockaddr_storage conn_bound_addr;
+  socklen_t conn_addrlen = connector.addr_len;
+  ASSERT_THAT(
+      getsockname(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+                  &conn_addrlen),
+      SyscallSucceeds());
+
+  // close the connecting FD to trigger FIN_WAIT2  on the connected fd.
+  conn_fd.reset();
+
+  // Now bind and connect a new socket.
+  const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  // Disable cooperative saves after this point. As a save between the first
+  // bind/connect and the second one can cause the linger timeout timer to
+  // be restarted causing the final bind/connect to fail.
+  DisableSave ds;
+
+  // TODO(gvisor.dev/issue/1030): Portmanager does not track all 5 tuple
+  //   reservations which causes the bind() to succeed on gVisor but connect
+  //   correctly fails.
+  if (IsRunningOnGvisor()) {
+    ASSERT_THAT(
+        bind(conn_fd2.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+             conn_addrlen),
+        SyscallSucceeds());
+    ASSERT_THAT(RetryEINTR(connect)(conn_fd2.get(),
+                                    reinterpret_cast<sockaddr*>(&conn_addr),
+                                    conn_addrlen),
+                SyscallFailsWithErrno(EADDRINUSE));
+  } else {
+    ASSERT_THAT(
+        bind(conn_fd2.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+             conn_addrlen),
+        SyscallFailsWithErrno(EADDRINUSE));
+  }
+
+  // Sleep for a little over the linger timeout to reduce flakiness in
+  // save/restore tests.
+  absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+
+  ds.reset();
+
+  if (!IsRunningOnGvisor()) {
+    ASSERT_THAT(
+        bind(conn_fd2.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+             conn_addrlen),
+        SyscallSucceeds());
+  }
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd2.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  conn_addrlen),
+              SyscallSucceeds());
+}
+
+// TCPLinger2TimeoutAfterClose creates a pair of connected sockets
+// then closes one end to trigger FIN_WAIT2 state for the closed endpoint.
+// It then sleeps for the TCP_LINGER2 timeout and verifies that bind/
+// connecting the same address succeeds.
+//
+// TCP timers are not S/R today, this can cause this test to be flaky when run
+// under random S/R due to timer being reset on a restore.
+TEST_P(SocketInetLoopbackTest, TCPLinger2TimeoutAfterClose_NoRandomSave) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+  // Get the address/port bound by the connecting socket.
+  sockaddr_storage conn_bound_addr;
+  socklen_t conn_addrlen = connector.addr_len;
+  ASSERT_THAT(
+      getsockname(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+                  &conn_addrlen),
+      SyscallSucceeds());
+
+  constexpr int kTCPLingerTimeout = 5;
+  EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2,
+                         &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
+              SyscallSucceedsWithValue(0));
+
+  // close the connecting FD to trigger FIN_WAIT2  on the connected fd.
+  conn_fd.reset();
+
+  absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+
+  // Now bind and connect a new socket and verify that we can immediately
+  // rebind the address bound by the conn_fd as it never entered TIME_WAIT.
+  const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  ASSERT_THAT(bind(conn_fd2.get(),
+                   reinterpret_cast<sockaddr*>(&conn_bound_addr), conn_addrlen),
+              SyscallSucceeds());
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd2.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  conn_addrlen),
+              SyscallSucceeds());
+}
+
+// TCPResetAfterClose creates a pair of connected sockets then closes
+// one end to trigger FIN_WAIT2 state for the closed endpoint and verifies
+// that we generate RSTs for any new data after the socket is fully
+// closed.
+TEST_P(SocketInetLoopbackTest, TCPResetAfterClose) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+  // close the connecting FD to trigger FIN_WAIT2  on the connected fd.
+  conn_fd.reset();
+
+  int data = 1234;
+
+  // Now send data which should trigger a RST as the other end has
+  // already closed the socket.
+  EXPECT_THAT(RetryEINTR(send)(accepted.get(), &data, sizeof(data), 0),
+              SyscallSucceeds());
+  // Sleep for a short while to get a RST back.
+  absl::SleepFor(absl::Seconds(1));
+
+  // Try writing again and we should get an EPIPE back.
+  EXPECT_THAT(RetryEINTR(send)(accepted.get(), &data, sizeof(data), 0),
+              SyscallFailsWithErrno(EPIPE));
+
+  // Trying to read should return zero as the other end did send
+  // us a FIN. We do it twice to verify that the RST does not cause an
+  // ECONNRESET on the read after EOF has been read by the application.
+  EXPECT_THAT(RetryEINTR(recv)(accepted.get(), &data, sizeof(data), 0),
+              SyscallSucceedsWithValue(0));
+  EXPECT_THAT(RetryEINTR(recv)(accepted.get(), &data, sizeof(data), 0),
+              SyscallSucceedsWithValue(0));
+}
+
+// This test is disabled under random save as the restore run
+// results in the stack.Seed() being different which can cause
+// sequence number of final connect to be one that is considered
+// old and can cause the test to be flaky.
+TEST_P(SocketInetLoopbackTest, TCPTimeWaitTest_NoRandomSave) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  // We disable saves after this point as a S/R causes the netstack seed
+  // to be regenerated which changes what ports/ISN is picked for a given
+  // tuple (src ip,src port, dst ip, dst port). This can cause the final
+  // SYN to use a sequence number that looks like one from the current
+  // connection in TIME_WAIT and will not be accepted causing the test
+  // to timeout.
+  //
+  // TODO(gvisor.dev/issue/940): S/R portSeed/portHint
+  DisableSave ds;
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+  // Get the address/port bound by the connecting socket.
+  sockaddr_storage conn_bound_addr;
+  socklen_t conn_addrlen = connector.addr_len;
+  ASSERT_THAT(
+      getsockname(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+                  &conn_addrlen),
+      SyscallSucceeds());
+
+  // close the accept FD to trigger TIME_WAIT on the accepted socket which
+  // should cause the conn_fd to follow CLOSE_WAIT->LAST_ACK->CLOSED instead of
+  // TIME_WAIT.
+  accepted.reset();
+  absl::SleepFor(absl::Seconds(1));
+  conn_fd.reset();
+  absl::SleepFor(absl::Seconds(1));
+
+  // Now bind and connect a new socket and verify that we can immediately
+  // rebind the address bound by the conn_fd as it never entered TIME_WAIT.
+  const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  ASSERT_THAT(bind(conn_fd2.get(),
+                   reinterpret_cast<sockaddr*>(&conn_bound_addr), conn_addrlen),
+              SyscallSucceeds());
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd2.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  conn_addrlen),
+              SyscallSucceeds());
+}
+
 INSTANTIATE_TEST_SUITE_P(
     All, SocketInetLoopbackTest,
     ::testing::Values(
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 592448289..a37b49447 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -26,6 +26,7 @@
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
+#include "test/util/thread_util.h"
 
 namespace gvisor {
 namespace testing {
@@ -243,6 +244,31 @@ TEST_P(TCPSocketPairTest, ShutdownRdAllowsReadOfReceivedDataBeforeEOF) {
               SyscallSucceedsWithValue(0));
 }
 
+// This test verifies that a shutdown(SHUT_WR) by the server after sending
+// data still allows the client to read() the queued data, and that a client
+// close after sending its response allows the server to read the incoming
+// response.
+TEST_P(TCPSocketPairTest, ShutdownWrServerClientClose) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  char buf[10] = {};
+  ScopedThread t([&]() {
+    ASSERT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)),
+                SyscallSucceedsWithValue(sizeof(buf)));
+    ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
+                SyscallSucceedsWithValue(sizeof(buf)));
+    ASSERT_THAT(close(sockets->release_first_fd()),
+                SyscallSucceedsWithValue(0));
+  });
+  ASSERT_THAT(RetryEINTR(write)(sockets->second_fd(), buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+  ASSERT_THAT(RetryEINTR(shutdown)(sockets->second_fd(), SHUT_WR),
+              SyscallSucceedsWithValue(0));
+  t.Join();
+
+  ASSERT_THAT(RetryEINTR(read)(sockets->second_fd(), buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+}
+
 TEST_P(TCPSocketPairTest, ClosedReadNonBlockingSocket) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -696,5 +722,72 @@ TEST_P(TCPSocketPairTest, SetCongestionControlFailsForUnsupported) {
   EXPECT_EQ(0, memcmp(got_cc, old_cc, sizeof(old_cc)));
 }
 
+// Linux and Netstack both default to a 60s TCP_LINGER2 timeout.
+constexpr int kDefaultTCPLingerTimeout = 60;
+
+TEST_P(TCPSocketPairTest, TCPLingerTimeoutDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kDefaultTCPLingerTimeout);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutZeroOrLess) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kZero = 0;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &kZero,
+                         sizeof(kZero)),
+              SyscallSucceedsWithValue(0));
+
+  constexpr int kNegative = -1234;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2,
+                         &kNegative, sizeof(kNegative)),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutAboveDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // Values above the net.ipv4.tcp_fin_timeout are capped to tcp_fin_timeout
+  // on linux (defaults to 60 seconds on linux).
+  constexpr int kAboveDefault = kDefaultTCPLingerTimeout + 1;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2,
+                         &kAboveDefault, sizeof(kAboveDefault)),
+              SyscallSucceedsWithValue(0));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kDefaultTCPLingerTimeout);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPLingerTimeout) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // Values below the default (net.ipv4.tcp_fin_timeout, 60 seconds on linux)
+  // are not capped and should be returned unchanged by getsockopt.
+  constexpr int kTCPLingerTimeout = kDefaultTCPLingerTimeout - 1;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2,
+                         &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
+              SyscallSucceedsWithValue(0));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kTCPLingerTimeout);
+}
+
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 773071680021a2fb985f3a3af7e9f65cdc1bd1ed Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Mon, 11 Nov 2019 14:13:50 -0800
Subject: Make `connect` on socket returned by `accept` correctly error out
 with EISCONN

PiperOrigin-RevId: 279814493
---
 pkg/tcpip/transport/tcp/accept.go   |  2 ++
 pkg/tcpip/transport/tcp/tcp_test.go |  3 +++
 test/syscalls/linux/tcp_socket.cc   | 13 +++++++++++++
 3 files changed, 18 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 0e8e0a2b4..f24b51b91 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -300,6 +300,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	ep.mu.Lock()
 	ep.stack.Stats().TCP.CurrentEstablished.Increment()
 	ep.state = StateEstablished
+	ep.isConnectNotified = true
 	ep.mu.Unlock()
 
 	// Update the receive window scaling. We can't do it before the
@@ -539,6 +540,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// Switch state to connected.
 		n.stack.Stats().TCP.CurrentEstablished.Increment()
 		n.state = StateEstablished
+		n.isConnectNotified = true
 
 		// Do the delivery in a separate goroutine so
 		// that we don't block the listen loop in case
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 0c1704d74..84579ce52 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -4599,6 +4599,9 @@ func TestEndpointBindListenAcceptState(t *testing.T) {
 	if got, want := tcp.EndpointState(aep.State()), tcp.StateEstablished; got != want {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
 	}
+	if err := aep.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrAlreadyConnected {
+		t.Errorf("Unexpected error attempting to call connect on an established endpoint, got: %v, want: %v", err, tcpip.ErrAlreadyConnected)
+	}
 	// Listening endpoint remains in listen state.
 	if got, want := tcp.EndpointState(ep.State()), tcp.StateListen; got != want {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 277d6835a..bfc77ffc2 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -130,6 +130,19 @@ void TcpSocketTest::TearDown() {
   }
 }
 
+TEST_P(TcpSocketTest, ConnectOnEstablishedConnection) {
+  sockaddr_storage addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t addrlen = sizeof(addr);
+
+  ASSERT_THAT(
+      connect(s_, reinterpret_cast<const struct sockaddr*>(&addr), addrlen),
+      SyscallFailsWithErrno(EISCONN));
+  ASSERT_THAT(
+      connect(t_, reinterpret_cast<const struct sockaddr*>(&addr), addrlen),
+      SyscallFailsWithErrno(EISCONN));
+}
+
 TEST_P(TcpSocketTest, DataCoalesced) {
   char buf[10];
 
-- 
cgit v1.2.3


From 2b0e4dc6aa7fb8a3f619220b72537a8fff2f95b4 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Mon, 11 Nov 2019 15:49:49 -0800
Subject: Remove obsolete TODO. This is now fixed.

PiperOrigin-RevId: 279835100
---
 test/syscalls/linux/tcp_socket.cc | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index bfc77ffc2..99863b0ed 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -425,6 +425,11 @@ TEST_P(TcpSocketTest, PollWithFullBufferBlocks) {
   }
   // The last error should have been EWOULDBLOCK.
   ASSERT_EQ(errno, EWOULDBLOCK);
+
+  // Now polling on the FD with a timeout should return 0 corresponding to no
+  // FDs ready.
+  struct pollfd poll_fd = {s_, POLLOUT, 0};
+  EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10), SyscallSucceedsWithValue(0));
 }
 
 TEST_P(TcpSocketTest, MsgTrunc) {
-- 
cgit v1.2.3


From b82bd24f9495435cadd2713db829b19ce8fcce9d Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Mon, 11 Nov 2019 18:34:28 -0800
Subject: Update ephemeral port reservation tests.

The existing tests which are disabled on gVisor are failing because we default
to SO_REUSEADDR being enabled for TCP sockets. Update the test comments.

Also add new tests for enabled SO_REUSEADDR.

PiperOrigin-RevId: 279862275
---
 test/syscalls/linux/socket_inet_loopback.cc | 223 ++++++++++++++++++++++++++--
 1 file changed, 212 insertions(+), 11 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 2eeee352e..96a1731cf 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -1156,10 +1156,9 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6OnlyV6AnyReservesV6) {
     sockaddr_storage addr_dual = test_addr_dual.addr;
     const FileDescriptor fd_dual = ASSERT_NO_ERRNO_AND_VALUE(
         Socket(test_addr_dual.family(), param.type, 0));
-    int one = 1;
-    EXPECT_THAT(
-        setsockopt(fd_dual.get(), IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one)),
-        SyscallSucceeds());
+    EXPECT_THAT(setsockopt(fd_dual.get(), IPPROTO_IPV6, IPV6_V6ONLY,
+                           &kSockOptOn, sizeof(kSockOptOn)),
+                SyscallSucceeds());
     ASSERT_THAT(bind(fd_dual.get(), reinterpret_cast<sockaddr*>(&addr_dual),
                      test_addr_dual.addr_len),
                 SyscallSucceeds());
@@ -1207,7 +1206,8 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6OnlyV6AnyReservesV6) {
 TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) {
   auto const& param = GetParam();
 
-  // FIXME(b/114268588)
+  // FIXME(b/76031995): Support disabling SO_REUSEADDR for TCP sockets and make
+  // it disabled by default.
   SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM);
 
   for (int i = 0; true; i++) {
@@ -1305,10 +1305,76 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) {
   }
 }
 
+TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReservedReuseAddr) {
+  auto const& param = GetParam();
+
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_DGRAM);
+
+  // Bind the v6 loopback on a dual stack socket.
+  TestAddress const& test_addr = V6Loopback();
+  sockaddr_storage bound_addr = test_addr.addr;
+  const FileDescriptor bound_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                   test_addr.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Listen iff TCP.
+  if (param.type == SOCK_STREAM) {
+    ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds());
+  }
+
+  // Get the port that we bound.
+  socklen_t bound_addr_len = test_addr.addr_len;
+  ASSERT_THAT(
+      getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                  &bound_addr_len),
+      SyscallSucceeds());
+
+  // Connect to bind an ephemeral port.
+  const FileDescriptor connected_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(connect(connected_fd.get(),
+                      reinterpret_cast<sockaddr*>(&bound_addr), bound_addr_len),
+              SyscallSucceeds());
+
+  // Get the ephemeral port.
+  sockaddr_storage connected_addr = {};
+  socklen_t connected_addr_len = sizeof(connected_addr);
+  ASSERT_THAT(getsockname(connected_fd.get(),
+                          reinterpret_cast<sockaddr*>(&connected_addr),
+                          &connected_addr_len),
+              SyscallSucceeds());
+  uint16_t const ephemeral_port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
+
+  // Verify that we actually got an ephemeral port.
+  ASSERT_NE(ephemeral_port, 0);
+
+  // Verify that the ephemeral port is not reserved.
+  const FileDescriptor checking_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(checking_fd.get(), reinterpret_cast<sockaddr*>(&connected_addr),
+           connected_addr_len),
+      SyscallSucceeds());
+}
+
 TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) {
   auto const& param = GetParam();
 
-  // FIXME(b/114268588)
+  // FIXME(b/76031995): Support disabling SO_REUSEADDR for TCP sockets and make
+  // it disabled by default.
   SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM);
 
   for (int i = 0; true; i++) {
@@ -1408,9 +1474,8 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) {
       // v6-only socket.
       const FileDescriptor fd_v6_only_any = ASSERT_NO_ERRNO_AND_VALUE(
           Socket(test_addr_v6_any.family(), param.type, 0));
-      int one = 1;
       EXPECT_THAT(setsockopt(fd_v6_only_any.get(), IPPROTO_IPV6, IPV6_V6ONLY,
-                             &one, sizeof(one)),
+                             &kSockOptOn, sizeof(kSockOptOn)),
                   SyscallSucceeds());
       ret =
           bind(fd_v6_only_any.get(), reinterpret_cast<sockaddr*>(&addr_v6_any),
@@ -1429,10 +1494,78 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) {
   }
 }
 
+TEST_P(SocketMultiProtocolInetLoopbackTest,
+       V4MappedEphemeralPortReservedResueAddr) {
+  auto const& param = GetParam();
+
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_DGRAM);
+
+  // Bind the v4 loopback on a dual stack socket.
+  TestAddress const& test_addr = V4MappedLoopback();
+  sockaddr_storage bound_addr = test_addr.addr;
+  const FileDescriptor bound_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                   test_addr.addr_len),
+              SyscallSucceeds());
+
+  ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Listen iff TCP.
+  if (param.type == SOCK_STREAM) {
+    ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds());
+  }
+
+  // Get the port that we bound.
+  socklen_t bound_addr_len = test_addr.addr_len;
+  ASSERT_THAT(
+      getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                  &bound_addr_len),
+      SyscallSucceeds());
+
+  // Connect to bind an ephemeral port.
+  const FileDescriptor connected_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(connect(connected_fd.get(),
+                      reinterpret_cast<sockaddr*>(&bound_addr), bound_addr_len),
+              SyscallSucceeds());
+
+  // Get the ephemeral port.
+  sockaddr_storage connected_addr = {};
+  socklen_t connected_addr_len = sizeof(connected_addr);
+  ASSERT_THAT(getsockname(connected_fd.get(),
+                          reinterpret_cast<sockaddr*>(&connected_addr),
+                          &connected_addr_len),
+              SyscallSucceeds());
+  uint16_t const ephemeral_port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
+
+  // Verify that we actually got an ephemeral port.
+  ASSERT_NE(ephemeral_port, 0);
+
+  // Verify that the ephemeral port is not reserved.
+  const FileDescriptor checking_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(checking_fd.get(), reinterpret_cast<sockaddr*>(&connected_addr),
+           connected_addr_len),
+      SyscallSucceeds());
+}
+
 TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) {
   auto const& param = GetParam();
 
-  // FIXME(b/114268588)
+  // FIXME(b/76031995): Support disabling SO_REUSEADDR for TCP sockets and make
+  // it disabled by default.
   SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM);
 
   for (int i = 0; true; i++) {
@@ -1533,9 +1666,8 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) {
       // v6-only socket.
       const FileDescriptor fd_v6_only_any = ASSERT_NO_ERRNO_AND_VALUE(
           Socket(test_addr_v6_any.family(), param.type, 0));
-      int one = 1;
       EXPECT_THAT(setsockopt(fd_v6_only_any.get(), IPPROTO_IPV6, IPV6_V6ONLY,
-                             &one, sizeof(one)),
+                             &kSockOptOn, sizeof(kSockOptOn)),
                   SyscallSucceeds());
       ret =
           bind(fd_v6_only_any.get(), reinterpret_cast<sockaddr*>(&addr_v6_any),
@@ -1554,6 +1686,75 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) {
   }
 }
 
+TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReservedReuseAddr) {
+  auto const& param = GetParam();
+
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_DGRAM);
+
+  // Bind the v4 loopback on a v4 socket.
+  TestAddress const& test_addr = V4Loopback();
+  sockaddr_storage bound_addr = test_addr.addr;
+  const FileDescriptor bound_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+
+  ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                   test_addr.addr_len),
+              SyscallSucceeds());
+
+  // Listen iff TCP.
+  if (param.type == SOCK_STREAM) {
+    ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds());
+  }
+
+  // Get the port that we bound.
+  socklen_t bound_addr_len = test_addr.addr_len;
+  ASSERT_THAT(
+      getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+                  &bound_addr_len),
+      SyscallSucceeds());
+
+  // Connect to bind an ephemeral port.
+  const FileDescriptor connected_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+
+  ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(connect(connected_fd.get(),
+                      reinterpret_cast<sockaddr*>(&bound_addr), bound_addr_len),
+              SyscallSucceeds());
+
+  // Get the ephemeral port.
+  sockaddr_storage connected_addr = {};
+  socklen_t connected_addr_len = sizeof(connected_addr);
+  ASSERT_THAT(getsockname(connected_fd.get(),
+                          reinterpret_cast<sockaddr*>(&connected_addr),
+                          &connected_addr_len),
+              SyscallSucceeds());
+  uint16_t const ephemeral_port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
+
+  // Verify that we actually got an ephemeral port.
+  ASSERT_NE(ephemeral_port, 0);
+
+  // Verify that the ephemeral port is not reserved.
+  const FileDescriptor checking_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+  ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(checking_fd.get(), reinterpret_cast<sockaddr*>(&connected_addr),
+           connected_addr_len),
+      SyscallSucceeds());
+}
+
 TEST_P(SocketMultiProtocolInetLoopbackTest, PortReuseTwoSockets) {
   auto const& param = GetParam();
   TestAddress const& test_addr = V4Loopback();
-- 
cgit v1.2.3


From 57a2a5ea3359e0879f5e4cc40fdb9ad973c689a8 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 12 Nov 2019 14:02:53 -0800
Subject: Add tests for SO_REUSEADDR and SO_REUSEPORT.

* Basic tests for the SO_REUSEADDR and SO_REUSEPORT options.
* SO_REUSEADDR functional tests for TCP and UDP.
* SO_REUSEADDR and SO_REUSEPORT interaction tests for UDP.
* Stubbed support for UDP getsockopt(SO_REUSEADDR).

PiperOrigin-RevId: 280049265
---
 pkg/tcpip/transport/udp/endpoint.go                |    4 +
 test/syscalls/linux/BUILD                          |    1 +
 test/syscalls/linux/socket_ip_udp_generic.cc       |  128 ++-
 test/syscalls/linux/socket_ipv4_udp_unbound.cc     | 1116 ++++++++++++--------
 test/syscalls/linux/socket_ipv4_udp_unbound.h      |    4 +-
 .../linux/socket_ipv4_udp_unbound_loopback.cc      |   13 +-
 test/syscalls/linux/socket_test_util.h             |    3 +
 7 files changed, 789 insertions(+), 480 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 5270f24df..dda7af910 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -732,6 +732,10 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		*o = tcpip.MulticastLoopOption(v)
 		return nil
 
+	case *tcpip.ReuseAddressOption:
+		*o = 0
+		return nil
+
 	case *tcpip.ReusePortOption:
 		e.mu.RLock()
 		v := e.reusePort
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index f8b8cb724..6345ea28c 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2234,6 +2234,7 @@ cc_library(
         ":ip_socket_test_util",
         ":socket_test_util",
         "//test/util:test_util",
+        "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 2a4ed04a5..66eb68857 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -35,7 +35,7 @@ TEST_P(UDPSocketPairTest, MulticastTTLDefault) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -52,7 +52,7 @@ TEST_P(UDPSocketPairTest, SetUDPMulticastTTLMin) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -69,7 +69,7 @@ TEST_P(UDPSocketPairTest, SetUDPMulticastTTLMax) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -91,7 +91,7 @@ TEST_P(UDPSocketPairTest, SetUDPMulticastTTLNegativeOne) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -126,7 +126,7 @@ TEST_P(UDPSocketPairTest, SetUDPMulticastTTLChar) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -147,7 +147,7 @@ TEST_P(UDPSocketPairTest, MulticastLoopDefault) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -163,7 +163,7 @@ TEST_P(UDPSocketPairTest, SetMulticastLoop) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -173,7 +173,7 @@ TEST_P(UDPSocketPairTest, SetMulticastLoop) {
                          &kSockOptOn, sizeof(kSockOptOn)),
               SyscallSucceeds());
 
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -192,7 +192,7 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
 
   int get = -1;
   socklen_t get_len = sizeof(get);
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
@@ -202,12 +202,120 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
                          &kSockOptOnChar, sizeof(kSockOptOnChar)),
               SyscallSucceeds());
 
-  EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &get, &get_len),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_len, sizeof(get));
   EXPECT_EQ(get, kSockOptOn);
 }
 
+TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(UDPSocketPairTest, SetReuseAddr) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOff, sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(UDPSocketPairTest, ReusePortDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(UDPSocketPairTest, SetReusePort) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT,
+                         &kSockOptOff, sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(UDPSocketPairTest, SetReuseAddrReusePort) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index b828b6844..00dc24928 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -18,9 +18,11 @@
 #include <sys/ioctl.h>
 #include <sys/socket.h>
 #include <sys/un.h>
+
 #include <cstdio>
 
 #include "gtest/gtest.h"
+#include "absl/memory/memory.h"
 #include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
@@ -51,26 +53,27 @@ TestAddress V4Broadcast() {
 
 // Check that packets are not received without a group membership. Default send
 // interface configured by bind.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNoGroup) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNoGroup) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the first FD to the loopback. This is an alternative to
   // IP_MULTICAST_IF for setting the default send interface.
   auto sender_addr = V4Loopback();
   EXPECT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
 
   // Bind the second FD to the v4 any address. If multicast worked like unicast,
   // this would ensure that we get the packet.
   auto receiver_addr = V4Any();
-  EXPECT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -82,33 +85,33 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNoGroup) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  EXPECT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  EXPECT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
 
 // Check that not setting a default send interface prevents multicast packets
 // from being sent. Group membership interface configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddrNoDefaultSendIf) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackAddrNoDefaultSendIf) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the second FD to the v4 any address to ensure that we can receive any
   // unicast packet.
   auto receiver_addr = V4Any();
-  EXPECT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -118,8 +121,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddrNoDefaultSendIf) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -128,27 +131,27 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddrNoDefaultSendIf) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  EXPECT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallFailsWithErrno(ENETUNREACH));
+  EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallFailsWithErrno(ENETUNREACH));
 }
 
 // Check that not setting a default send interface prevents multicast packets
 // from being sent. Group membership interface configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNicNoDefaultSendIf) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNicNoDefaultSendIf) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the second FD to the v4 any address to ensure that we can receive any
   // unicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -158,8 +161,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNicNoDefaultSendIf) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -168,35 +171,35 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNicNoDefaultSendIf) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  EXPECT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallFailsWithErrno(ENETUNREACH));
+  EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallFailsWithErrno(ENETUNREACH));
 }
 
 // Check that multicast works when the default send interface is configured by
 // bind and the group membership is configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the first FD to the loopback. This is an alternative to
   // IP_MULTICAST_IF for setting the default send interface.
   auto sender_addr = V4Loopback();
   ASSERT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -206,8 +209,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -216,43 +219,42 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
 
 // Check that multicast works when the default send interface is configured by
 // bind and the group membership is configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNic) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the first FD to the loopback. This is an alternative to
   // IP_MULTICAST_IF for setting the default send interface.
   auto sender_addr = V4Loopback();
   ASSERT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -262,8 +264,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -272,17 +274,15 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -290,25 +290,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in sendto, and the group
 // membership is configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -318,8 +319,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -328,17 +329,15 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -346,25 +345,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in sendto, and the group
 // membership is configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNic) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreqn iface = {};
   iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -374,8 +374,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -384,17 +384,15 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -402,25 +400,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in connect, and the group
 // membership is configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrConnect) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -430,8 +429,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -439,22 +438,20 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) {
   reinterpret_cast<sockaddr_in*>(&connect_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   ASSERT_THAT(
-      RetryEINTR(connect)(sockets->first_fd(),
+      RetryEINTR(connect)(socket1->get(),
                           reinterpret_cast<sockaddr*>(&connect_addr.addr),
                           connect_addr.addr_len),
       SyscallSucceeds());
 
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(send)(socket1->get(), send_buf, sizeof(send_buf), 0),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -462,25 +459,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in connect, and the group
 // membership is configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicConnect) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreqn iface = {};
   iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -490,8 +488,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -499,22 +497,20 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) {
   reinterpret_cast<sockaddr_in*>(&connect_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   ASSERT_THAT(
-      RetryEINTR(connect)(sockets->first_fd(),
+      RetryEINTR(connect)(socket1->get(),
                           reinterpret_cast<sockaddr*>(&connect_addr.addr),
                           connect_addr.addr_len),
       SyscallSucceeds());
 
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(send)(socket1->get(), send_buf, sizeof(send_buf), 0),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -522,25 +518,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in sendto, and the group
 // membership is configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelf) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrSelf) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the first FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -550,8 +547,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelf) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -560,17 +557,15 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelf) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -578,25 +573,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelf) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in sendto, and the group
 // membership is configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelf) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelf) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreqn iface = {};
   iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the first FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -606,8 +602,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelf) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -616,17 +612,15 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelf) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -634,25 +628,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelf) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in connect, and the group
 // membership is configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfConnect) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrSelfConnect) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the first FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -662,8 +657,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfConnect) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -671,20 +666,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfConnect) {
   reinterpret_cast<sockaddr_in*>(&connect_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   EXPECT_THAT(
-      RetryEINTR(connect)(sockets->first_fd(),
+      RetryEINTR(connect)(socket1->get(),
                           reinterpret_cast<sockaddr*>(&connect_addr.addr),
                           connect_addr.addr_len),
       SyscallSucceeds());
 
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(send)(socket1->get(), send_buf, sizeof(send_buf), 0),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf),
+  EXPECT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
@@ -692,25 +686,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfConnect) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in connect, and the group
 // membership is configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfConnect) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelfConnect) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreqn iface = {};
   iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
   // Bind the first FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -720,8 +715,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfConnect) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -729,20 +724,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfConnect) {
   reinterpret_cast<sockaddr_in*>(&connect_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   ASSERT_THAT(
-      RetryEINTR(connect)(sockets->first_fd(),
+      RetryEINTR(connect)(socket1->get(),
                           reinterpret_cast<sockaddr*>(&connect_addr.addr),
                           connect_addr.addr_len),
       SyscallSucceeds());
 
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(send)(socket1->get(), send_buf, sizeof(send_buf), 0),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf),
+  EXPECT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
@@ -750,29 +744,30 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfConnect) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in sendto, and the group
 // membership is configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfNoLoop) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrSelfNoLoop) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &kSockOptOff, sizeof(kSockOptOff)),
               SyscallSucceeds());
 
   // Bind the first FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -782,8 +777,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfNoLoop) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -792,17 +787,15 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfNoLoop) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
@@ -810,29 +803,30 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfNoLoop) {
 // Check that multicast works when the default send interface is configured by
 // IP_MULTICAST_IF, the send address is specified in sendto, and the group
 // membership is configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfNoLoop) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelfNoLoop) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Set the default send interface.
   ip_mreqn iface = {};
   iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_LOOP,
                          &kSockOptOff, sizeof(kSockOptOff)),
               SyscallSucceeds());
 
-  // Bind the second FD to the v4 any address to ensure that we can receive the
+  // Bind the first FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -842,8 +836,8 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfNoLoop) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -852,57 +846,57 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfNoLoop) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(
-      RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0),
-      SyscallSucceedsWithValue(sizeof(recv_buf)));
+  ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
+              SyscallSucceedsWithValue(sizeof(recv_buf)));
 
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
 
 // Check that dropping a group membership that does not exist fails.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastInvalidDrop) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastInvalidDrop) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Unregister from a membership that we didn't have.
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallFailsWithErrno(EADDRNOTAVAIL));
 }
 
 // Check that dropping a group membership prevents multicast packets from being
 // delivered. Default send address configured by bind and group membership
 // interface configured by address.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropAddr) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastDropAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the first FD to the loopback. This is an alternative to
   // IP_MULTICAST_IF for setting the default send interface.
   auto sender_addr = V4Loopback();
   EXPECT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  EXPECT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -912,11 +906,11 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropAddr) {
   ip_mreq group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -925,15 +919,14 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropAddr) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  EXPECT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  EXPECT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
@@ -941,26 +934,27 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropAddr) {
 // Check that dropping a group membership prevents multicast packets from being
 // delivered. Default send address configured by bind and group membership
 // interface configured by NIC ID.
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropNic) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastDropNic) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind the first FD to the loopback. This is an alternative to
   // IP_MULTICAST_IF for setting the default send interface.
   auto sender_addr = V4Loopback();
   EXPECT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
 
   // Bind the second FD to the v4 any address to ensure that we can receive the
   // multicast packet.
   auto receiver_addr = V4Any();
-  EXPECT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  EXPECT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -970,11 +964,11 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropNic) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet.
@@ -983,50 +977,53 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropNic) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  EXPECT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&send_addr.addr),
-                         send_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&send_addr.addr),
+                                 send_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we did not receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  EXPECT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  EXPECT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfZero) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfZero) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn iface = {};
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfInvalidNic) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfInvalidNic) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn iface = {};
   iface.imr_ifindex = -1;
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallFailsWithErrno(EADDRNOTAVAIL));
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfInvalidAddr) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfInvalidAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreq iface = {};
   iface.imr_interface.s_addr = inet_addr("255.255.255");
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallFailsWithErrno(EADDRNOTAVAIL));
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetShort) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetShort) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Create a valid full-sized request.
   ip_mreqn iface = {};
@@ -1034,29 +1031,31 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetShort) {
 
   // Send an optlen of 1 to check that optlen is enforced.
   EXPECT_THAT(
-      setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &iface, 1),
+      setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, 1),
       SyscallFailsWithErrno(EINVAL));
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfDefault) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfDefault) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   in_addr get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
   EXPECT_EQ(size, sizeof(get));
   EXPECT_EQ(get.s_addr, 0);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfDefaultReqn) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfDefaultReqn) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
 
   // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the
@@ -1071,19 +1070,20 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfDefaultReqn) {
   EXPECT_EQ(get.imr_ifindex, 0);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetAddrGetReqn) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetAddrGetReqn) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   in_addr set = {};
   set.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
                          sizeof(set)),
               SyscallSucceeds());
 
   ip_mreqn get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
 
   // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the
@@ -1095,19 +1095,20 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetAddrGetReqn) {
   EXPECT_EQ(get.imr_ifindex, 0);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetReqAddrGetReqn) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetReqAddrGetReqn) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreq set = {};
   set.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
                          sizeof(set)),
               SyscallSucceeds());
 
   ip_mreqn get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
 
   // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the
@@ -1119,19 +1120,20 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetReqAddrGetReqn) {
   EXPECT_EQ(get.imr_ifindex, 0);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetNicGetReqn) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetNicGetReqn) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn set = {};
   set.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
                          sizeof(set)),
               SyscallSucceeds());
 
   ip_mreqn get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
   EXPECT_EQ(size, sizeof(in_addr));
   EXPECT_EQ(get.imr_multiaddr.s_addr, 0);
@@ -1139,87 +1141,93 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetNicGetReqn) {
   EXPECT_EQ(get.imr_ifindex, 0);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetAddr) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   in_addr set = {};
   set.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
                          sizeof(set)),
               SyscallSucceeds());
 
   in_addr get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
 
   EXPECT_EQ(size, sizeof(get));
   EXPECT_EQ(get.s_addr, set.s_addr);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetReqAddr) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetReqAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreq set = {};
   set.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
                          sizeof(set)),
               SyscallSucceeds());
 
   in_addr get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
 
   EXPECT_EQ(size, sizeof(get));
   EXPECT_EQ(get.s_addr, set.imr_interface.s_addr);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetNic) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetNic) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn set = {};
   set.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
                          sizeof(set)),
               SyscallSucceeds());
 
   in_addr get = {};
   socklen_t size = sizeof(get);
   ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+      getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
       SyscallSucceeds());
   EXPECT_EQ(size, sizeof(get));
   EXPECT_EQ(get.s_addr, 0);
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, TestJoinGroupNoIf) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestJoinGroupNoIf) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallFailsWithErrno(ENODEV));
 }
 
-TEST_P(IPv4UDPUnboundSocketPairTest, TestJoinGroupInvalidIf) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestJoinGroupInvalidIf) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn group = {};
   group.imr_address.s_addr = inet_addr("255.255.255");
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallFailsWithErrno(ENODEV));
 }
 
 // Check that multiple memberships are not allowed on the same socket.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestMultipleJoinsOnSingleSocket) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-  auto fd = sockets->first_fd();
+TEST_P(IPv4UDPUnboundSocketTest, TestMultipleJoinsOnSingleSocket) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto fd = socket1->get();
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
@@ -1234,41 +1242,44 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestMultipleJoinsOnSingleSocket) {
 }
 
 // Check that two sockets can join the same multicast group at the same time.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestTwoSocketsJoinSameMulticastGroup) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestTwoSocketsJoinSameMulticastGroup) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Drop the membership twice on each socket, the second call for each socket
   // should fail.
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
-  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallFailsWithErrno(EADDRNOTAVAIL));
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
-  EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP,
-                         &group, sizeof(group)),
+  EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallFailsWithErrno(EADDRNOTAVAIL));
 }
 
 // Check that two sockets can join the same multicast group at the same time,
 // and both will receive data on it.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestMcastReceptionOnTwoSockets) {
+TEST_P(IPv4UDPUnboundSocketTest, TestMcastReceptionOnTwoSockets) {
   std::unique_ptr<SocketPair> socket_pairs[2] = {
-      ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()),
-      ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair())};
+      absl::make_unique<FDSocketPair>(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+                                      ASSERT_NO_ERRNO_AND_VALUE(NewSocket())),
+      absl::make_unique<FDSocketPair>(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+                                      ASSERT_NO_ERRNO_AND_VALUE(NewSocket()))};
 
   ip_mreq iface = {}, group = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
@@ -1338,11 +1349,12 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestMcastReceptionOnTwoSockets) {
 // Check that on two sockets that joined a group and listen on ANY, dropping
 // memberships one by one will continue to deliver packets to both sockets until
 // both memberships have been dropped.
-TEST_P(IPv4UDPUnboundSocketPairTest,
-       TestMcastReceptionWhenDroppingMemberships) {
+TEST_P(IPv4UDPUnboundSocketTest, TestMcastReceptionWhenDroppingMemberships) {
   std::unique_ptr<SocketPair> socket_pairs[2] = {
-      ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()),
-      ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair())};
+      absl::make_unique<FDSocketPair>(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+                                      ASSERT_NO_ERRNO_AND_VALUE(NewSocket())),
+      absl::make_unique<FDSocketPair>(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+                                      ASSERT_NO_ERRNO_AND_VALUE(NewSocket()))};
 
   ip_mreq iface = {}, group = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
@@ -1437,18 +1449,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest,
 
 // Check that a receiving socket can bind to the multicast address before
 // joining the group and receive data once the group has been joined.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenJoinThenReceive) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToMcastThenJoinThenReceive) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind second socket (receiver) to the multicast address.
   auto receiver_addr = V4Multicast();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   // Update receiver_addr with the correct port number.
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -1458,30 +1471,29 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenJoinThenReceive) {
   ip_mreqn group = {};
   group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
   group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
-  ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
-                         &group, sizeof(group)),
+  ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+                         sizeof(group)),
               SyscallSucceeds());
 
   // Send a multicast packet on the first socket out the loopback interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
   auto sendto_addr = V4Multicast();
   reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&sendto_addr.addr),
-                         sendto_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+                                 sendto_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallSucceedsWithValue(sizeof(recv_buf)));
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
@@ -1489,18 +1501,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenJoinThenReceive) {
 
 // Check that a receiving socket can bind to the multicast address and won't
 // receive multicast data if it hasn't joined the group.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenNoJoinThenNoReceive) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToMcastThenNoJoinThenNoReceive) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind second socket (receiver) to the multicast address.
   auto receiver_addr = V4Multicast();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   // Update receiver_addr with the correct port number.
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -1509,40 +1522,40 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenNoJoinThenNoReceive) {
   // Send a multicast packet on the first socket out the loopback interface.
   ip_mreq iface = {};
   iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
-                         &iface, sizeof(iface)),
+  ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+                         sizeof(iface)),
               SyscallSucceeds());
   auto sendto_addr = V4Multicast();
   reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&sendto_addr.addr),
-                         sendto_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+                                 sendto_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we don't receive the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallFailsWithErrno(EAGAIN));
 }
 
 // Check that a socket can bind to a multicast address and still send out
 // packets.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenSend) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToMcastThenSend) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind second socket (receiver) to the ANY address.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -1551,11 +1564,11 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenSend) {
   // Bind the first socket (sender) to the multicast address.
   auto sender_addr = V4Multicast();
   ASSERT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
   socklen_t sender_addr_len = sender_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&sender_addr.addr),
                           &sender_addr_len),
               SyscallSucceeds());
@@ -1567,15 +1580,14 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenSend) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&sendto_addr.addr),
-                         sendto_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+                                 sendto_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallSucceedsWithValue(sizeof(recv_buf)));
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
@@ -1583,46 +1595,46 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToMcastThenSend) {
 
 // Check that a receiving socket can bind to the broadcast address and receive
 // broadcast packets.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToBcastThenReceive) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToBcastThenReceive) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind second socket (receiver) to the broadcast address.
   auto receiver_addr = V4Broadcast();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
   EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
 
   // Send a broadcast packet on the first socket out the loopback interface.
-  EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_BROADCAST,
-                         &kSockOptOn, sizeof(kSockOptOn)),
+  EXPECT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn,
+                         sizeof(kSockOptOn)),
               SyscallSucceedsWithValue(0));
   // Note: Binding to the loopback interface makes the broadcast go out of it.
   auto sender_bind_addr = V4Loopback();
-  ASSERT_THAT(bind(sockets->first_fd(),
-                   reinterpret_cast<sockaddr*>(&sender_bind_addr.addr),
-                   sender_bind_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_bind_addr.addr),
+           sender_bind_addr.addr_len),
+      SyscallSucceeds());
   auto sendto_addr = V4Broadcast();
   reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port =
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&sendto_addr.addr),
-                         sendto_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+                                 sendto_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the multicast packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallSucceedsWithValue(sizeof(recv_buf)));
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
@@ -1630,17 +1642,18 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToBcastThenReceive) {
 
 // Check that a socket can bind to the broadcast address and still send out
 // packets.
-TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToBcastThenSend) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToBcastThenSend) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
 
   // Bind second socket (receiver) to the ANY address.
   auto receiver_addr = V4Any();
-  ASSERT_THAT(bind(sockets->second_fd(),
-                   reinterpret_cast<sockaddr*>(&receiver_addr.addr),
-                   receiver_addr.addr_len),
-              SyscallSucceeds());
+  ASSERT_THAT(
+      bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+           receiver_addr.addr_len),
+      SyscallSucceeds());
   socklen_t receiver_addr_len = receiver_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->second_fd(),
+  ASSERT_THAT(getsockname(socket2->get(),
                           reinterpret_cast<sockaddr*>(&receiver_addr.addr),
                           &receiver_addr_len),
               SyscallSucceeds());
@@ -1649,11 +1662,11 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToBcastThenSend) {
   // Bind the first socket (sender) to the broadcast address.
   auto sender_addr = V4Broadcast();
   ASSERT_THAT(
-      bind(sockets->first_fd(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+      bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
            sender_addr.addr_len),
       SyscallSucceeds());
   socklen_t sender_addr_len = sender_addr.addr_len;
-  ASSERT_THAT(getsockname(sockets->first_fd(),
+  ASSERT_THAT(getsockname(socket1->get(),
                           reinterpret_cast<sockaddr*>(&sender_addr.addr),
                           &sender_addr_len),
               SyscallSucceeds());
@@ -1665,19 +1678,202 @@ TEST_P(IPv4UDPUnboundSocketPairTest, TestBindToBcastThenSend) {
       reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
   char send_buf[200];
   RandomizeBuffer(send_buf, sizeof(send_buf));
-  ASSERT_THAT(
-      RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
-                         reinterpret_cast<sockaddr*>(&sendto_addr.addr),
-                         sendto_addr.addr_len),
-      SyscallSucceedsWithValue(sizeof(send_buf)));
+  ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+                                 reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+                                 sendto_addr.addr_len),
+              SyscallSucceedsWithValue(sizeof(send_buf)));
 
   // Check that we received the packet.
   char recv_buf[sizeof(send_buf)] = {};
-  ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf),
+  ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
                                MSG_DONTWAIT),
               SyscallSucceedsWithValue(sizeof(recv_buf)));
   EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
 }
 
+// Check that SO_REUSEADDR always delivers to the most recently bound socket.
+TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrDistribution) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  std::vector<std::unique_ptr<FileDescriptor>> sockets;
+  sockets.emplace_back(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()));
+
+  ASSERT_THAT(setsockopt(sockets[0]->get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(sockets[0]->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(sockets[0]->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  constexpr int kMessageSize = 200;
+
+  for (int i = 0; i < 10; i++) {
+    // Add a new receiver.
+    sockets.emplace_back(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()));
+    auto& last = sockets.back();
+    ASSERT_THAT(setsockopt(last->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                           sizeof(kSockOptOn)),
+                SyscallSucceeds());
+    ASSERT_THAT(bind(last->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                     addr.addr_len),
+                SyscallSucceeds());
+
+    // Send a new message to the SO_REUSEADDR group. We use a new socket each
+    // time so that a new ephemeral port will be used each time. This ensures
+    // that we aren't doing REUSEPORT-like hash load balancing.
+    auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+    char send_buf[kMessageSize];
+    RandomizeBuffer(send_buf, sizeof(send_buf));
+    EXPECT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+                                   reinterpret_cast<sockaddr*>(&addr.addr),
+                                   addr.addr_len),
+                SyscallSucceedsWithValue(sizeof(send_buf)));
+
+    // Verify that the most recent socket got the message. We don't expect any
+    // of the other sockets to have received it, but we will check that later.
+    char recv_buf[sizeof(send_buf)] = {};
+    EXPECT_THAT(
+        RetryEINTR(recv)(last->get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT),
+        SyscallSucceedsWithValue(sizeof(send_buf)));
+    EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+  }
+
+  // Verify that no other messages were received.
+  for (auto& socket : sockets) {
+    char recv_buf[kMessageSize] = {};
+    EXPECT_THAT(RetryEINTR(recv)(socket->get(), recv_buf, sizeof(recv_buf),
+                                 MSG_DONTWAIT),
+                SyscallFailsWithErrno(EAGAIN));
+  }
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrThenReusePort) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, only with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReusePortThenReuseAddr) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, only with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallFailsWithErrno(EADDRINUSE));
+}
+
+// Check that REUSEPORT takes precedence over REUSEADDR.
+TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
+  auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto receiver2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  ASSERT_THAT(setsockopt(receiver1->get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(receiver1->get(), SOL_SOCKET, SO_REUSEPORT,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(receiver1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(receiver1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind receiver2 to the same address as receiver1, also with REUSEADDR and
+  // REUSEPORT.
+  ASSERT_THAT(setsockopt(receiver2->get(), SOL_SOCKET, SO_REUSEADDR,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(receiver2->get(), SOL_SOCKET, SO_REUSEPORT,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(receiver2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  constexpr int kMessageSize = 10;
+
+  for (int i = 0; i < 100; ++i) {
+    // Send a new message to the REUSEADDR/REUSEPORT group. We use a new socket
+    // each time so that a new ephemeral port will be used each time. This
+    // ensures that we cycle through hashes.
+    auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+    char send_buf[kMessageSize] = {};
+    EXPECT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+                                   reinterpret_cast<sockaddr*>(&addr.addr),
+                                   addr.addr_len),
+                SyscallSucceedsWithValue(sizeof(send_buf)));
+  }
+
+  // Check that both receivers got messages. This checks that we are using load
+  // balancing (REUSEPORT) instead of the most recently bound socket
+  // (REUSEADDR).
+  char recv_buf[kMessageSize] = {};
+  EXPECT_THAT(RetryEINTR(recv)(receiver1->get(), recv_buf, sizeof(recv_buf),
+                               MSG_DONTWAIT),
+              SyscallSucceedsWithValue(kMessageSize));
+  EXPECT_THAT(RetryEINTR(recv)(receiver2->get(), recv_buf, sizeof(recv_buf),
+                               MSG_DONTWAIT),
+              SyscallSucceedsWithValue(kMessageSize));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.h b/test/syscalls/linux/socket_ipv4_udp_unbound.h
index 8e07bfbbf..f64c57645 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.h
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.h
@@ -20,8 +20,8 @@
 namespace gvisor {
 namespace testing {
 
-// Test fixture for tests that apply to pairs of IPv4 UDP sockets.
-using IPv4UDPUnboundSocketPairTest = SocketPairTest;
+// Test fixture for tests that apply to IPv4 UDP sockets.
+using IPv4UDPUnboundSocketTest = SimpleSocketTest;
 
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc
index cb0105471..f121c044d 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc
@@ -22,14 +22,11 @@
 namespace gvisor {
 namespace testing {
 
-std::vector<SocketPairKind> GetSocketPairs() {
-  return ApplyVec<SocketPairKind>(
-      IPv4UDPUnboundSocketPair,
-      AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK}));
-}
-
-INSTANTIATE_TEST_SUITE_P(IPv4UDPSockets, IPv4UDPUnboundSocketPairTest,
-                         ::testing::ValuesIn(GetSocketPairs()));
+INSTANTIATE_TEST_SUITE_P(
+    IPv4UDPSockets, IPv4UDPUnboundSocketTest,
+    ::testing::ValuesIn(ApplyVec<SocketKind>(IPv4UDPUnboundSocket,
+                                             AllBitwiseCombinations(List<int>{
+                                                 0, SOCK_NONBLOCK}))));
 
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h
index be38907c2..2dbb8bed3 100644
--- a/test/syscalls/linux/socket_test_util.h
+++ b/test/syscalls/linux/socket_test_util.h
@@ -114,6 +114,9 @@ class FDSocketPair : public SocketPair {
  public:
   FDSocketPair(int first_fd, int second_fd)
       : first_(first_fd), second_(second_fd) {}
+  FDSocketPair(std::unique_ptr<FileDescriptor> first_fd,
+               std::unique_ptr<FileDescriptor> second_fd)
+      : first_(first_fd->release()), second_(second_fd->release()) {}
 
   int first_fd() const override { return first_.get(); }
   int second_fd() const override { return second_.get(); }
-- 
cgit v1.2.3


From 2c6c9af904c99371fe4381517375cd114917db59 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 12 Nov 2019 20:37:40 -0800
Subject: Add UDP SO_REUSEADDR/SO_REUSEPORT conversion tests.

Add additional tests for UDP SO_REUSEADDR and SO_REUSEPORT interaction.

If all currently bound sockets as well as the socket being bound have
SO_REUSEADDR, or if all currently bound sockets as well as the socket being
bound have SO_REUSEPORT, binding to an already-bound address is allowed. This
seems odd since it means that the effective SO_REUSEADDR/SO_REUSEPORT behavior
of a socket that has both options set can change as additional sockets are
bound.

PiperOrigin-RevId: 280116163
---
 test/syscalls/linux/socket_ipv4_udp_unbound.cc | 274 +++++++++++++++++++++++++
 1 file changed, 274 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index 00dc24928..6b1af6c17 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -1814,6 +1814,280 @@ TEST_P(IPv4UDPUnboundSocketTest, BindReusePortThenReuseAddr) {
               SyscallFailsWithErrno(EADDRINUSE));
 }
 
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReusePort) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR and REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, only with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  // Bind socket3 to the same address as socket1, only with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR and REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, only with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  // Bind socket3 to the same address as socket1, only with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConversionReversable1) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR and REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, only with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  // Close socket2 to revert to just socket1 with REUSEADDR and REUSEPORT.
+  socket2->reset();
+
+  // Bind socket3 to the same address as socket1, only with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConversionReversable2) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR and REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, only with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  // Close socket2 to revert to just socket1 with REUSEADDR and REUSEPORT.
+  socket2->reset();
+
+  // Bind socket3 to the same address as socket1, only with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindDoubleReuseAddrReusePortThenReusePort) {
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR and REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, also with REUSEADDR and
+  // REUSEPORT.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  // Bind socket3 to the same address as socket1, only with REUSEPORT.
+  ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindDoubleReuseAddrReusePortThenReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // Bind socket1 with REUSEADDR and REUSEPORT.
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(socket1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Bind socket2 to the same address as socket1, also with REUSEADDR and
+  // REUSEPORT.
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+
+  // Bind socket3 to the same address as socket1, only with REUSEADDR.
+  ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+  ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+}
+
 // Check that REUSEPORT takes precedence over REUSEADDR.
 TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
   auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
-- 
cgit v1.2.3


From 1e55eb3800a60c1a1118b84f2534b78481702f38 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 13 Nov 2019 15:34:47 -0800
Subject: test/syscalls/proc: check the return code of waitid

PiperOrigin-RevId: 280295208
---
 test/syscalls/linux/proc.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index e4c030bbb..512de5ee0 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -183,7 +183,8 @@ PosixError WithSubprocess(SubprocessCallback const& running,
   siginfo_t info;
   // Wait until the child process has exited (WEXITED flag) but don't
   // reap the child (WNOWAIT flag).
-  waitid(P_PID, child_pid, &info, WNOWAIT | WEXITED);
+  EXPECT_THAT(waitid(P_PID, child_pid, &info, WNOWAIT | WEXITED),
+              SyscallSucceeds());
 
   if (zombied) {
     // Arg of "Z" refers to a Zombied Process.
-- 
cgit v1.2.3


From 339536de5eefe782813aabae4aeeb312b3c4dde7 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Thu, 14 Nov 2019 15:55:07 -0800
Subject: Check that a file is a regular file with open(O_TRUNC).

It was possible to panic the sentry by opening a cache revalidating folder with
O_TRUNC|O_CREAT.

Avoids breaking php tests.

PiperOrigin-RevId: 280533213
---
 pkg/sentry/fs/inode.go                |  4 ++++
 pkg/sentry/fs/tty/master.go           |  1 +
 pkg/sentry/fs/tty/slave.go            |  1 +
 pkg/sentry/syscalls/linux/sys_file.go |  9 +++++----
 test/syscalls/linux/open.cc           | 22 ++++++++++++++++++++++
 test/syscalls/linux/open_create.cc    | 24 ++++++++++++++++++++++++
 test/syscalls/linux/pty.cc            | 20 +++++++++++++++++++-
 test/util/pty_util.cc                 | 10 +++++++++-
 test/util/pty_util.h                  |  3 +++
 9 files changed, 88 insertions(+), 6 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index f4ddfa406..2d43dff1d 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -344,6 +344,10 @@ func (i *Inode) SetTimestamps(ctx context.Context, d *Dirent, ts TimeSpec) error
 
 // Truncate calls i.InodeOperations.Truncate with i as the Inode.
 func (i *Inode) Truncate(ctx context.Context, d *Dirent, size int64) error {
+	if IsDir(i.StableAttr) {
+		return syserror.EISDIR
+	}
+
 	if i.overlay != nil {
 		return overlayTruncate(ctx, i.overlay, d, size)
 	}
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index 19b7557d5..bc56be696 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -32,6 +32,7 @@ import (
 // +stateify savable
 type masterInodeOperations struct {
 	fsutil.SimpleFileInode
+	fsutil.InodeNoopTruncate
 
 	// d is the containing dir.
 	d *dirInodeOperations
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index 944c4ada1..4cbea0367 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -31,6 +31,7 @@ import (
 // +stateify savable
 type slaveInodeOperations struct {
 	fsutil.SimpleFileInode
+	fsutil.InodeNoopTruncate
 
 	// d is the containing dir.
 	d *dirInodeOperations
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index b9a8e3e21..167c2b60b 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -169,10 +169,11 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint
 			if dirPath {
 				return syserror.ENOTDIR
 			}
-			if flags&linux.O_TRUNC != 0 {
-				if err := d.Inode.Truncate(t, d, 0); err != nil {
-					return err
-				}
+		}
+
+		if flags&linux.O_TRUNC != 0 {
+			if err := d.Inode.Truncate(t, d, 0); err != nil {
+				return err
 			}
 		}
 
diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc
index 2b1df52ce..267ae19f6 100644
--- a/test/syscalls/linux/open.cc
+++ b/test/syscalls/linux/open.cc
@@ -73,6 +73,28 @@ class OpenTest : public FileTest {
   const std::string test_data_ = "hello world\n";
 };
 
+TEST_F(OpenTest, OTrunc) {
+  auto dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
+  ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
+  ASSERT_THAT(open(dirpath.c_str(), O_TRUNC, 0666),
+              SyscallFailsWithErrno(EISDIR));
+}
+
+TEST_F(OpenTest, OTruncAndReadOnlyDir) {
+  auto dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
+  ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
+  ASSERT_THAT(open(dirpath.c_str(), O_TRUNC | O_RDONLY, 0666),
+              SyscallFailsWithErrno(EISDIR));
+}
+
+TEST_F(OpenTest, OTruncAndReadOnlyFile) {
+  auto dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncfile");
+  const FileDescriptor existing =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dirpath.c_str(), O_RDWR | O_CREAT, 0666));
+  const FileDescriptor otrunc = ASSERT_NO_ERRNO_AND_VALUE(
+      Open(dirpath.c_str(), O_TRUNC | O_RDONLY, 0666));
+}
+
 TEST_F(OpenTest, ReadOnly) {
   char buf;
   const FileDescriptor ro_file =
diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc
index e5a85ef9d..431733dbe 100644
--- a/test/syscalls/linux/open_create.cc
+++ b/test/syscalls/linux/open_create.cc
@@ -88,6 +88,30 @@ TEST(CreateTest, CreateExclusively) {
               SyscallFailsWithErrno(EEXIST));
 }
 
+TEST(CreateTeast, CreatWithOTrunc) {
+  std::string dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
+  ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
+  ASSERT_THAT(open(dirpath.c_str(), O_CREAT | O_TRUNC, 0666),
+              SyscallFailsWithErrno(EISDIR));
+}
+
+TEST(CreateTeast, CreatDirWithOTruncAndReadOnly) {
+  std::string dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
+  ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
+  ASSERT_THAT(open(dirpath.c_str(), O_CREAT | O_TRUNC | O_RDONLY, 0666),
+              SyscallFailsWithErrno(EISDIR));
+}
+
+TEST(CreateTeast, CreatFileWithOTruncAndReadOnly) {
+  std::string dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncfile");
+  int dirfd;
+  ASSERT_THAT(dirfd = open(dirpath.c_str(), O_RDWR | O_CREAT, 0666),
+              SyscallSucceeds());
+  ASSERT_THAT(open(dirpath.c_str(), O_CREAT | O_TRUNC | O_RDONLY, 0666),
+              SyscallSucceeds());
+  ASSERT_THAT(close(dirfd), SyscallSucceeds());
+}
+
 TEST(CreateTest, CreateFailsOnUnpermittedDir) {
   // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to
   // always override directory permissions.
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
index 99a0df235..dafe64d20 100644
--- a/test/syscalls/linux/pty.cc
+++ b/test/syscalls/linux/pty.cc
@@ -70,6 +70,8 @@ constexpr absl::Duration kTimeout = absl::Seconds(20);
 // The maximum line size in bytes returned per read from a pty file.
 constexpr int kMaxLineSize = 4096;
 
+constexpr char kMasterPath[] = "/dev/ptmx";
+
 // glibc defines its own, different, version of struct termios. We care about
 // what the kernel does, not glibc.
 #define KERNEL_NCCS 19
@@ -376,9 +378,25 @@ PosixErrorOr<size_t> PollAndReadFd(int fd, void* buf, size_t count,
   return PosixError(ETIMEDOUT, "Poll timed out");
 }
 
+TEST(PtyTrunc, Truncate) {
+  // open() with O_TRUNC must succeed on both ends of a pty, while truncate()
+  // and ftruncate() on either end must fail with EINVAL.
+  const FileDescriptor master_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(kMasterPath, O_RDWR | O_TRUNC));
+  const int slave_index = ASSERT_NO_ERRNO_AND_VALUE(SlaveID(master_fd));
+  const std::string slave_path = absl::StrCat("/dev/pts/", slave_index);
+  const FileDescriptor slave_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Open(slave_path, O_RDWR | O_NONBLOCK | O_TRUNC));
+
+  EXPECT_THAT(truncate(kMasterPath, 0), SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(ftruncate(master_fd.get(), 0), SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(truncate(slave_path.c_str(), 0), SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(ftruncate(slave_fd.get(), 0), SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(BasicPtyTest, StatUnopenedMaster) {
   struct stat s;
-  ASSERT_THAT(stat("/dev/ptmx", &s), SyscallSucceeds());
+  ASSERT_THAT(stat(kMasterPath, &s), SyscallSucceeds());
 
   EXPECT_EQ(s.st_rdev, makedev(TTYAUX_MAJOR, kPtmxMinor));
   EXPECT_EQ(s.st_size, 0);
diff --git a/test/util/pty_util.cc b/test/util/pty_util.cc
index c0fd9a095..c01f916aa 100644
--- a/test/util/pty_util.cc
+++ b/test/util/pty_util.cc
@@ -24,6 +24,14 @@ namespace gvisor {
 namespace testing {
 
 PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master) {
+  PosixErrorOr<int> n = SlaveID(master);
+  if (!n.ok()) {
+    return PosixErrorOr<FileDescriptor>(n.error());
+  }
+  return Open(absl::StrCat("/dev/pts/", n.ValueOrDie()), O_RDWR | O_NONBLOCK);
+}
+
+PosixErrorOr<int> SlaveID(const FileDescriptor& master) {
   // Get pty index.
   int n;
   int ret = ioctl(master.get(), TIOCGPTN, &n);
@@ -38,7 +46,7 @@ PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master) {
     return PosixError(errno, "ioctl(TIOSPTLCK) failed");
   }
 
-  return Open(absl::StrCat("/dev/pts/", n), O_RDWR | O_NONBLOCK);
+  return n;
 }
 
 }  // namespace testing
diff --git a/test/util/pty_util.h b/test/util/pty_util.h
index 367b14f15..0722da379 100644
--- a/test/util/pty_util.h
+++ b/test/util/pty_util.h
@@ -24,6 +24,9 @@ namespace testing {
 // Opens the slave end of the passed master as R/W and nonblocking.
 PosixErrorOr<FileDescriptor> OpenSlave(const FileDescriptor& master);
 
+// Returns the pty index N of the slave end (/dev/pts/N) of the passed master.
+PosixErrorOr<int> SlaveID(const FileDescriptor& master);
+
 }  // namespace testing
 }  // namespace gvisor
 
-- 
cgit v1.2.3


From af323eb7c1830053627de6161f8ce73ac5f06d4e Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Thu, 14 Nov 2019 17:02:59 -0800
Subject: Fix return codes for {get,set}sockopt for some nullptr cases.

Updates #1092

PiperOrigin-RevId: 280547239
---
 pkg/sentry/syscalls/linux/sys_socket.go  | 23 +++++++++--------------
 test/syscalls/linux/socket_ip_unbound.cc | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 14 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index b5a72ce63..ab1001f16 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -447,16 +447,13 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 		return 0, nil, syserror.ENOTSOCK
 	}
 
-	// Read the length if present. Reject negative values.
+	// Read the length. Reject negative values.
 	optLen := int32(0)
-	if optLenAddr != 0 {
-		if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
-			return 0, nil, err
-		}
-
-		if optLen < 0 {
-			return 0, nil, syserror.EINVAL
-		}
+	if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+		return 0, nil, err
+	}
+	if optLen < 0 {
+		return 0, nil, syserror.EINVAL
 	}
 
 	// Call syscall implementation then copy both value and value len out.
@@ -465,11 +462,9 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 		return 0, nil, e.ToError()
 	}
 
-	if optLenAddr != 0 {
-		vLen := int32(binary.Size(v))
-		if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
-			return 0, nil, err
-		}
+	vLen := int32(binary.Size(v))
+	if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+		return 0, nil, err
 	}
 
 	if v != nil {
diff --git a/test/syscalls/linux/socket_ip_unbound.cc b/test/syscalls/linux/socket_ip_unbound.cc
index b02872308..b6754111f 100644
--- a/test/syscalls/linux/socket_ip_unbound.cc
+++ b/test/syscalls/linux/socket_ip_unbound.cc
@@ -354,6 +354,38 @@ TEST_P(IPUnboundSocketTest, InvalidNegativeTOS) {
   EXPECT_EQ(get, expect);
 }
 
+TEST_P(IPUnboundSocketTest, NullTOS) {
+  auto sock = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  const int fd = sock->get();
+  const TOSOption tos = GetTOSOption(GetParam().domain);
+  const int set_len = sizeof(int);
+  // setsockopt() with a null optval and a non-zero optlen. IPv4 rejects it
+  // with EFAULT. For IPv6 the behavior is not yet compatible:
+  //  - gVisor tries to read optval from user memory in the syscall handler
+  //    and so also fails with EFAULT; matching Linux just for IPv6 would
+  //    need substantial refactoring.
+  //  - Linux's IPv6 stack treats a nullptr optval as an input of 0, so the
+  //    call succeeds (net/ipv6/ipv6_sockglue.c, do_ipv6_setsockopt()).
+  //    Linux's implementation would need fixing, as passing a nullptr optval
+  //    with a non-zero optlen may not be valid.
+  const bool expect_efault =
+      GetParam().domain == AF_INET || IsRunningOnGvisor();
+  if (expect_efault) {
+    EXPECT_THAT(setsockopt(fd, tos.level, tos.option, nullptr, set_len),
+                SyscallFailsWithErrno(EFAULT));
+  } else {
+    EXPECT_THAT(setsockopt(fd, tos.level, tos.option, nullptr, set_len),
+                SyscallSucceedsWithValue(0));
+  }
+  // getsockopt() must fail with EFAULT if either optval or optlen is null.
+  socklen_t get_len = sizeof(int);
+  EXPECT_THAT(getsockopt(fd, tos.level, tos.option, nullptr, &get_len),
+              SyscallFailsWithErrno(EFAULT));
+  int value = -1;
+  EXPECT_THAT(getsockopt(fd, tos.level, tos.option, &value, nullptr),
+              SyscallFailsWithErrno(EFAULT));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     IPUnboundSockets, IPUnboundSocketTest,
     ::testing::ValuesIn(VecCat<SocketKind>(VecCat<SocketKind>(
-- 
cgit v1.2.3


From c0f89eba6ebdec08460bd796fc62d6aef674d141 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 21 Nov 2019 11:29:49 -0800
Subject: Import and structure cleanup.

PiperOrigin-RevId: 281795269
---
 pkg/eventchannel/BUILD                             |   1 +
 pkg/flipcall/BUILD                                 |   2 +-
 pkg/flipcall/flipcall_unsafe.go                    |  10 +-
 pkg/sentry/BUILD                                   |   3 +
 pkg/sentry/fs/BUILD                                |   2 +-
 pkg/sentry/fs/fdpipe/pipe_opener_test.go           |   1 +
 pkg/sentry/fs/overlay.go                           |   4 +-
 pkg/sentry/fsimpl/memfs/BUILD                      |   3 +-
 pkg/sentry/kernel/BUILD                            |   4 +-
 pkg/sentry/kernel/auth/BUILD                       |   2 +-
 pkg/sentry/kernel/futex/BUILD                      |   2 +-
 pkg/sentry/kernel/signalfd/BUILD                   |   4 +-
 pkg/sentry/kernel/task.go                          |   4 +-
 pkg/sentry/mm/BUILD                                |   2 +-
 pkg/sentry/mm/mm.go                                |   6 +-
 pkg/sentry/strace/strace.proto                     |   3 +-
 pkg/sentry/time/BUILD                              |   4 +-
 pkg/sentry/vfs/BUILD                               |   2 +-
 pkg/sentry/vfs/mount_unsafe.go                     |   4 +-
 pkg/state/object.proto                             |  56 ++++----
 pkg/syncutil/BUILD                                 |  54 ++++++++
 pkg/syncutil/LICENSE                               |  27 ++++
 pkg/syncutil/README.md                             |   5 +
 pkg/syncutil/atomicptr_unsafe.go                   |  47 +++++++
 pkg/syncutil/atomicptrtest/BUILD                   |  29 ++++
 pkg/syncutil/atomicptrtest/atomicptr_test.go       |  31 +++++
 pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go   |  21 +++
 pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go   |  16 +++
 pkg/syncutil/downgradable_rwmutex_test.go          | 150 ++++++++++++++++++++
 pkg/syncutil/downgradable_rwmutex_unsafe.go        | 143 +++++++++++++++++++
 pkg/syncutil/memmove_unsafe.go                     |  28 ++++
 pkg/syncutil/norace_unsafe.go                      |  35 +++++
 pkg/syncutil/race_unsafe.go                        |  41 ++++++
 pkg/syncutil/seqatomic_unsafe.go                   |  72 ++++++++++
 pkg/syncutil/seqatomictest/BUILD                   |  35 +++++
 pkg/syncutil/seqatomictest/seqatomic_test.go       | 132 ++++++++++++++++++
 pkg/syncutil/seqcount.go                           | 149 ++++++++++++++++++++
 pkg/syncutil/seqcount_test.go                      | 153 +++++++++++++++++++++
 pkg/syncutil/syncutil.go                           |   7 +
 test/syscalls/linux/accept_bind.cc                 |   2 +
 test/syscalls/linux/accept_bind_stream.cc          |   2 +
 test/syscalls/linux/chmod.cc                       |   1 +
 test/syscalls/linux/chroot.cc                      |   1 +
 test/syscalls/linux/clock_gettime.cc               |   1 +
 test/syscalls/linux/concurrency.cc                 |   1 +
 test/syscalls/linux/exec_binary.cc                 |   1 +
 test/syscalls/linux/file_base.h                    |   1 +
 test/syscalls/linux/flock.cc                       |   1 +
 test/syscalls/linux/fork.cc                        |   1 +
 test/syscalls/linux/getdents.cc                    |   1 +
 test/syscalls/linux/ip_socket_test_util.cc         |   5 +-
 test/syscalls/linux/memory_accounting.cc           |   1 +
 test/syscalls/linux/mlock.cc                       |   1 +
 test/syscalls/linux/mmap.cc                        |   1 +
 test/syscalls/linux/mount.cc                       |   1 +
 test/syscalls/linux/read.cc                        |   1 +
 test/syscalls/linux/rename.cc                      |   1 +
 test/syscalls/linux/seccomp.cc                     |   1 +
 test/syscalls/linux/select.cc                      |   1 +
 test/syscalls/linux/shm.cc                         |   1 -
 test/syscalls/linux/socket_blocking.cc             |   1 +
 test/syscalls/linux/socket_ip_loopback_blocking.cc |   1 +
 .../linux/socket_ip_tcp_generic_loopback.cc        |   1 +
 .../linux/socket_ip_tcp_loopback_blocking.cc       |   1 +
 .../linux/socket_ip_tcp_loopback_nonblock.cc       |   1 +
 .../socket_ipv4_tcp_unbound_external_networking.cc |   1 +
 ...et_ipv4_tcp_unbound_external_networking_test.cc |   3 +-
 ...et_ipv4_udp_unbound_external_networking_test.cc |   3 +-
 test/syscalls/linux/socket_netlink_util.cc         |   4 +-
 test/syscalls/linux/socket_unix_blocking_local.cc  |   3 +-
 test/syscalls/linux/socket_unix_dgram.cc           |   1 +
 .../linux/socket_unix_dgram_non_blocking.cc        |   1 +
 .../linux/socket_unix_non_stream_blocking_local.cc |   3 +-
 test/syscalls/linux/socket_unix_seqpacket.cc       |   1 +
 .../linux/socket_unix_stream_blocking_local.cc     |   3 +-
 .../linux/socket_unix_stream_nonblock_local.cc     |   3 +-
 .../syscalls/linux/socket_unix_unbound_abstract.cc |   1 +
 .../linux/socket_unix_unbound_filesystem.cc        |   1 +
 .../linux/socket_unix_unbound_seqpacket.cc         |   1 +
 test/syscalls/linux/socket_unix_unbound_stream.cc  |   1 +
 test/syscalls/linux/sync.cc                        |   3 +-
 test/syscalls/linux/truncate.cc                    |   1 +
 .../syscalls/linux/unix_domain_socket_test_util.cc |   1 +
 test/syscalls/linux/unix_domain_socket_test_util.h |   1 +
 test/syscalls/linux/utimes.cc                      |   1 +
 test/syscalls/linux/vdso_clock_gettime.cc          |   1 +
 test/util/fs_util_test.cc                          |   4 +-
 test/util/mount_util.h                             |   1 +
 test/util/posix_error_test.cc                      |   1 +
 test/util/rlimit_util.cc                           |   1 +
 test/util/signal_util.cc                           |   1 +
 test/util/signal_util.h                            |   1 +
 test/util/temp_path.h                              |   1 +
 test/util/test_util_test.cc                        |   1 +
 third_party/gvsync/BUILD                           |  53 -------
 third_party/gvsync/LICENSE                         |  27 ----
 third_party/gvsync/README.md                       |   3 -
 third_party/gvsync/atomicptr_unsafe.go             |  47 -------
 third_party/gvsync/atomicptrtest/BUILD             |  28 ----
 third_party/gvsync/atomicptrtest/atomicptr_test.go |  31 -----
 .../gvsync/downgradable_rwmutex_1_12_unsafe.go     |  21 ---
 .../gvsync/downgradable_rwmutex_1_13_unsafe.go     |  16 ---
 third_party/gvsync/downgradable_rwmutex_test.go    | 150 --------------------
 third_party/gvsync/downgradable_rwmutex_unsafe.go  | 143 -------------------
 third_party/gvsync/gvsync.go                       |   7 -
 third_party/gvsync/memmove_unsafe.go               |  28 ----
 third_party/gvsync/norace_unsafe.go                |  35 -----
 third_party/gvsync/race_unsafe.go                  |  41 ------
 third_party/gvsync/seqatomic_unsafe.go             |  72 ----------
 third_party/gvsync/seqatomictest/BUILD             |  34 -----
 third_party/gvsync/seqatomictest/seqatomic_test.go | 132 ------------------
 third_party/gvsync/seqcount.go                     | 149 --------------------
 third_party/gvsync/seqcount_test.go                | 153 ---------------------
 tools/go_marshal/test/BUILD                        |   3 +-
 tools/go_marshal/test/external/BUILD               |   4 +-
 115 files changed, 1302 insertions(+), 1250 deletions(-)
 create mode 100644 pkg/syncutil/BUILD
 create mode 100644 pkg/syncutil/LICENSE
 create mode 100644 pkg/syncutil/README.md
 create mode 100644 pkg/syncutil/atomicptr_unsafe.go
 create mode 100644 pkg/syncutil/atomicptrtest/BUILD
 create mode 100644 pkg/syncutil/atomicptrtest/atomicptr_test.go
 create mode 100644 pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go
 create mode 100644 pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go
 create mode 100644 pkg/syncutil/downgradable_rwmutex_test.go
 create mode 100644 pkg/syncutil/downgradable_rwmutex_unsafe.go
 create mode 100644 pkg/syncutil/memmove_unsafe.go
 create mode 100644 pkg/syncutil/norace_unsafe.go
 create mode 100644 pkg/syncutil/race_unsafe.go
 create mode 100644 pkg/syncutil/seqatomic_unsafe.go
 create mode 100644 pkg/syncutil/seqatomictest/BUILD
 create mode 100644 pkg/syncutil/seqatomictest/seqatomic_test.go
 create mode 100644 pkg/syncutil/seqcount.go
 create mode 100644 pkg/syncutil/seqcount_test.go
 create mode 100644 pkg/syncutil/syncutil.go
 delete mode 100644 third_party/gvsync/BUILD
 delete mode 100644 third_party/gvsync/LICENSE
 delete mode 100644 third_party/gvsync/README.md
 delete mode 100644 third_party/gvsync/atomicptr_unsafe.go
 delete mode 100644 third_party/gvsync/atomicptrtest/BUILD
 delete mode 100644 third_party/gvsync/atomicptrtest/atomicptr_test.go
 delete mode 100644 third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go
 delete mode 100644 third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go
 delete mode 100644 third_party/gvsync/downgradable_rwmutex_test.go
 delete mode 100644 third_party/gvsync/downgradable_rwmutex_unsafe.go
 delete mode 100644 third_party/gvsync/gvsync.go
 delete mode 100644 third_party/gvsync/memmove_unsafe.go
 delete mode 100644 third_party/gvsync/norace_unsafe.go
 delete mode 100644 third_party/gvsync/race_unsafe.go
 delete mode 100644 third_party/gvsync/seqatomic_unsafe.go
 delete mode 100644 third_party/gvsync/seqatomictest/BUILD
 delete mode 100644 third_party/gvsync/seqatomictest/seqatomic_test.go
 delete mode 100644 third_party/gvsync/seqcount.go
 delete mode 100644 third_party/gvsync/seqcount_test.go

(limited to 'test/syscalls/linux')

diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD
index 71f2abc83..0b4b7cc44 100644
--- a/pkg/eventchannel/BUILD
+++ b/pkg/eventchannel/BUILD
@@ -25,6 +25,7 @@ go_library(
 proto_library(
     name = "eventchannel_proto",
     srcs = ["event.proto"],
+    visibility = ["//:sandbox"],
 )
 
 go_proto_library(
diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD
index 5643d5f26..e590a71ba 100644
--- a/pkg/flipcall/BUILD
+++ b/pkg/flipcall/BUILD
@@ -19,7 +19,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/log",
         "//pkg/memutil",
-        "//third_party/gvsync",
+        "//pkg/syncutil",
     ],
 )
 
diff --git a/pkg/flipcall/flipcall_unsafe.go b/pkg/flipcall/flipcall_unsafe.go
index a37952637..27b8939fc 100644
--- a/pkg/flipcall/flipcall_unsafe.go
+++ b/pkg/flipcall/flipcall_unsafe.go
@@ -18,7 +18,7 @@ import (
 	"reflect"
 	"unsafe"
 
-	"gvisor.dev/gvisor/third_party/gvsync"
+	"gvisor.dev/gvisor/pkg/syncutil"
 )
 
 // Packets consist of a 16-byte header followed by an arbitrarily-sized
@@ -75,13 +75,13 @@ func (ep *Endpoint) Data() []byte {
 var ioSync int64
 
 func raceBecomeActive() {
-	if gvsync.RaceEnabled {
-		gvsync.RaceAcquire((unsafe.Pointer)(&ioSync))
+	if syncutil.RaceEnabled {
+		syncutil.RaceAcquire((unsafe.Pointer)(&ioSync))
 	}
 }
 
 func raceBecomeInactive() {
-	if gvsync.RaceEnabled {
-		gvsync.RaceReleaseMerge((unsafe.Pointer)(&ioSync))
+	if syncutil.RaceEnabled {
+		syncutil.RaceReleaseMerge((unsafe.Pointer)(&ioSync))
 	}
 }
diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD
index 2d6379c86..2a7122957 100644
--- a/pkg/sentry/BUILD
+++ b/pkg/sentry/BUILD
@@ -10,5 +10,8 @@ package_group(
         "//runsc/...",
         # Code generated by go_marshal relies on go_marshal libraries.
         "//tools/go_marshal/...",
+
+        # Keep the old paths as a temporary measure.
+        "//third_party/golang/gvisor/pkg/sentry/...",
     ],
 )
diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD
index 378602cc9..c035ffff7 100644
--- a/pkg/sentry/fs/BUILD
+++ b/pkg/sentry/fs/BUILD
@@ -68,9 +68,9 @@ go_library(
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
         "//pkg/state",
+        "//pkg/syncutil",
         "//pkg/syserror",
         "//pkg/waiter",
-        "//third_party/gvsync",
     ],
 )
 
diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go
index 8e4d839e1..577445148 100644
--- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go
+++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go
@@ -25,6 +25,7 @@ import (
 	"time"
 
 	"github.com/google/uuid"
+
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go
index 1d3ff39e0..25573e986 100644
--- a/pkg/sentry/fs/overlay.go
+++ b/pkg/sentry/fs/overlay.go
@@ -23,8 +23,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syncutil"
 	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/third_party/gvsync"
 )
 
 // The virtual filesystem implements an overlay configuration. For a high-level
@@ -199,7 +199,7 @@ type overlayEntry struct {
 	upper *Inode
 
 	// dirCacheMu protects dirCache.
-	dirCacheMu gvsync.DowngradableRWMutex `state:"nosave"`
+	dirCacheMu syncutil.DowngradableRWMutex `state:"nosave"`
 
 	// dirCache is cache of DentAttrs from upper and lower Inodes.
 	dirCache *SortedDentryMap
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index 04d667273..952b20c51 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -1,10 +1,9 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
 go_template_instance(
     name = "dentry_list",
     out = "dentry_list.go",
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index e041c51b3..2706927ff 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -35,7 +35,7 @@ go_template_instance(
     out = "seqatomic_taskgoroutineschedinfo_unsafe.go",
     package = "kernel",
     suffix = "TaskGoroutineSchedInfo",
-    template = "//third_party/gvsync:generic_seqatomic",
+    template = "//pkg/syncutil:generic_seqatomic",
     types = {
         "Value": "TaskGoroutineSchedInfo",
     },
@@ -209,12 +209,12 @@ go_library(
         "//pkg/sentry/usermem",
         "//pkg/state",
         "//pkg/state/statefile",
+        "//pkg/syncutil",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip",
         "//pkg/tcpip/stack",
         "//pkg/waiter",
-        "//third_party/gvsync",
     ],
 )
 
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 51de4568a..04c244447 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -8,7 +8,7 @@ go_template_instance(
     out = "atomicptr_credentials_unsafe.go",
     package = "auth",
     suffix = "Credentials",
-    template = "//third_party/gvsync:generic_atomicptr",
+    template = "//pkg/syncutil:generic_atomicptr",
     types = {
         "Value": "Credentials",
     },
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index 34286c7a8..75ec31761 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -9,7 +9,7 @@ go_template_instance(
     out = "atomicptr_bucket_unsafe.go",
     package = "futex",
     suffix = "Bucket",
-    template = "//third_party/gvsync:generic_atomicptr",
+    template = "//pkg/syncutil:generic_atomicptr",
     types = {
         "Value": "bucket",
     },
diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD
index 50b69d154..9f7e19b4d 100644
--- a/pkg/sentry/kernel/signalfd/BUILD
+++ b/pkg/sentry/kernel/signalfd/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_stateify:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "signalfd",
     srcs = ["signalfd.go"],
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 9be3dae3c..80c8e5464 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -35,8 +35,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syncutil"
 	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/third_party/gvsync"
 )
 
 // Task represents a thread of execution in the untrusted app.  It
@@ -83,7 +83,7 @@ type Task struct {
 	//
 	// gosched is protected by goschedSeq. gosched is owned by the task
 	// goroutine.
-	goschedSeq gvsync.SeqCount `state:"nosave"`
+	goschedSeq syncutil.SeqCount `state:"nosave"`
 	gosched    TaskGoroutineSchedInfo
 
 	// yieldCount is the number of times the task goroutine has called
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index a804b8b5c..839931f67 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -118,9 +118,9 @@ go_library(
         "//pkg/sentry/safemem",
         "//pkg/sentry/usage",
         "//pkg/sentry/usermem",
+        "//pkg/syncutil",
         "//pkg/syserror",
         "//pkg/tcpip/buffer",
-        "//third_party/gvsync",
     ],
 )
 
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index f350e0109..58a5c186d 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -44,7 +44,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/third_party/gvsync"
+	"gvisor.dev/gvisor/pkg/syncutil"
 )
 
 // MemoryManager implements a virtual address space.
@@ -82,7 +82,7 @@ type MemoryManager struct {
 	users int32
 
 	// mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
-	mappingMu gvsync.DowngradableRWMutex `state:"nosave"`
+	mappingMu syncutil.DowngradableRWMutex `state:"nosave"`
 
 	// vmas stores virtual memory areas. Since vmas are stored by value,
 	// clients should usually use vmaIterator.ValuePtr() instead of
@@ -125,7 +125,7 @@ type MemoryManager struct {
 
 	// activeMu is loosely analogous to Linux's struct
 	// mm_struct::page_table_lock.
-	activeMu gvsync.DowngradableRWMutex `state:"nosave"`
+	activeMu syncutil.DowngradableRWMutex `state:"nosave"`
 
 	// pmas stores platform mapping areas used to implement vmas. Since pmas
 	// are stored by value, clients should usually use pmaIterator.ValuePtr()
diff --git a/pkg/sentry/strace/strace.proto b/pkg/sentry/strace/strace.proto
index 4b2f73a5f..906c52c51 100644
--- a/pkg/sentry/strace/strace.proto
+++ b/pkg/sentry/strace/strace.proto
@@ -32,8 +32,7 @@ message Strace {
   }
 }
 
-message StraceEnter {
-}
+message StraceEnter {}
 
 message StraceExit {
   // Return value formatted as string.
diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD
index d3a4cd943..18e212dff 100644
--- a/pkg/sentry/time/BUILD
+++ b/pkg/sentry/time/BUILD
@@ -9,7 +9,7 @@ go_template_instance(
     out = "seqatomic_parameters_unsafe.go",
     package = "time",
     suffix = "Parameters",
-    template = "//third_party/gvsync:generic_seqatomic",
+    template = "//pkg/syncutil:generic_seqatomic",
     types = {
         "Value": "Parameters",
     },
@@ -36,8 +36,8 @@ go_library(
     deps = [
         "//pkg/log",
         "//pkg/metric",
+        "//pkg/syncutil",
         "//pkg/syserror",
-        "//third_party/gvsync",
     ],
 )
 
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 4f2c2de9f..74a325309 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -33,9 +33,9 @@ go_library(
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
         "//pkg/sentry/usermem",
+        "//pkg/syncutil",
         "//pkg/syserror",
         "//pkg/waiter",
-        "//third_party/gvsync",
     ],
 )
 
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index 75e6c7dfa..c98b42f91 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -26,7 +26,7 @@ import (
 	"sync/atomic"
 	"unsafe"
 
-	"gvisor.dev/gvisor/third_party/gvsync"
+	"gvisor.dev/gvisor/pkg/syncutil"
 )
 
 // mountKey represents the location at which a Mount is mounted. It is
@@ -72,7 +72,7 @@ type mountTable struct {
 	// intrinsics and inline assembly, limiting the performance of this
 	// approach.)
 
-	seq  gvsync.SeqCount
+	seq  syncutil.SeqCount
 	seed uint32 // for hashing keys
 
 	// size holds both length (number of elements) and capacity (number of
diff --git a/pkg/state/object.proto b/pkg/state/object.proto
index 952289069..5ebcfb151 100644
--- a/pkg/state/object.proto
+++ b/pkg/state/object.proto
@@ -18,8 +18,8 @@ package gvisor.state.statefile;
 
 // Slice is a slice value.
 message Slice {
-  uint32 length    = 1;
-  uint32 capacity  = 2;
+  uint32 length = 1;
+  uint32 capacity = 2;
   uint64 ref_value = 3;
 }
 
@@ -30,13 +30,13 @@ message Array {
 
 // Map is a map value.
 message Map {
-  repeated Object keys   = 1;
+  repeated Object keys = 1;
   repeated Object values = 2;
 }
 
 // Interface is an interface value.
 message Interface {
-  string type  = 1;
+  string type = 1;
   Object value = 2;
 }
 
@@ -47,7 +47,7 @@ message Struct {
 
 // Field encodes a single field.
 message Field {
-  string name  = 1;
+  string name = 1;
   Object value = 2;
 }
 
@@ -113,28 +113,28 @@ message Float32s {
 // Note that ref_value references an Object.id, below.
 message Object {
   oneof value {
-    bool      bool_value          = 1;
-    bytes     string_value        = 2;
-    int64     int64_value         = 3;
-    uint64    uint64_value        = 4;
-    double    double_value        = 5;
-    uint64    ref_value           = 6;
-    Slice     slice_value         = 7;
-    Array     array_value         = 8;
-    Interface interface_value     = 9;
-    Struct    struct_value        = 10;
-    Map       map_value           = 11;
-    bytes     byte_array_value    = 12;
-    Uint16s   uint16_array_value  = 13;
-    Uint32s   uint32_array_value  = 14;
-    Uint64s   uint64_array_value  = 15;
-    Uintptrs  uintptr_array_value = 16;
-    Int8s     int8_array_value    = 17;
-    Int16s    int16_array_value   = 18;
-    Int32s    int32_array_value   = 19;
-    Int64s    int64_array_value   = 20;
-    Bools     bool_array_value    = 21;
-    Float64s  float64_array_value = 22;
-    Float32s  float32_array_value = 23;
+    bool bool_value = 1;
+    bytes string_value = 2;
+    int64 int64_value = 3;
+    uint64 uint64_value = 4;
+    double double_value = 5;
+    uint64 ref_value = 6;
+    Slice slice_value = 7;
+    Array array_value = 8;
+    Interface interface_value = 9;
+    Struct struct_value = 10;
+    Map map_value = 11;
+    bytes byte_array_value = 12;
+    Uint16s uint16_array_value = 13;
+    Uint32s uint32_array_value = 14;
+    Uint64s uint64_array_value = 15;
+    Uintptrs uintptr_array_value = 16;
+    Int8s int8_array_value = 17;
+    Int16s int16_array_value = 18;
+    Int32s int32_array_value = 19;
+    Int64s int64_array_value = 20;
+    Bools bool_array_value = 21;
+    Float64s float64_array_value = 22;
+    Float32s float32_array_value = 23;
   }
 }
diff --git a/pkg/syncutil/BUILD b/pkg/syncutil/BUILD
new file mode 100644
index 000000000..b06a90bef
--- /dev/null
+++ b/pkg/syncutil/BUILD
@@ -0,0 +1,54 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template")
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+exports_files(["LICENSE"])
+
+go_template(
+    name = "generic_atomicptr",
+    srcs = ["atomicptr_unsafe.go"],
+    types = [
+        "Value",
+    ],
+)
+
+go_template(
+    name = "generic_seqatomic",
+    srcs = ["seqatomic_unsafe.go"],
+    types = [
+        "Value",
+    ],
+    deps = [
+        ":sync",  # NOTE(review): no ":sync" target is declared in this BUILD file (the go_library is ":syncutil") - confirm this label resolves.
+    ],
+)
+
+go_library(
+    name = "syncutil",
+    srcs = [
+        "downgradable_rwmutex_1_12_unsafe.go",
+        "downgradable_rwmutex_1_13_unsafe.go",
+        "downgradable_rwmutex_unsafe.go",
+        "memmove_unsafe.go",
+        "norace_unsafe.go",
+        "race_unsafe.go",
+        "seqcount.go",
+        "syncutil.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/syncutil",
+)
+
+go_test(
+    name = "syncutil_test",
+    size = "small",
+    srcs = [
+        "downgradable_rwmutex_test.go",
+        "seqcount_test.go",
+    ],
+    embed = [":syncutil"],
+)
diff --git a/pkg/syncutil/LICENSE b/pkg/syncutil/LICENSE
new file mode 100644
index 000000000..6a66aea5e
--- /dev/null
+++ b/pkg/syncutil/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/pkg/syncutil/README.md b/pkg/syncutil/README.md
new file mode 100644
index 000000000..2183c4e20
--- /dev/null
+++ b/pkg/syncutil/README.md
@@ -0,0 +1,5 @@
+# Syncutil
+
+This package provides additional synchronization primitives not provided by the
+Go stdlib 'sync' package. It is partially derived from the upstream 'sync'
+package from go1.10.
diff --git a/pkg/syncutil/atomicptr_unsafe.go b/pkg/syncutil/atomicptr_unsafe.go
new file mode 100644
index 000000000..525c4beed
--- /dev/null
+++ b/pkg/syncutil/atomicptr_unsafe.go
@@ -0,0 +1,47 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package template doesn't exist. This file must be instantiated using the
+// go_template_instance rule in tools/go_generics/defs.bzl.
+package template
+
+import (
+	"sync/atomic"
+	"unsafe"
+)
+
+// Value is a required type parameter, replaced when the template is instantiated.
+type Value struct{} // placeholder; this file only ever handles *Value
+
+// An AtomicPtr is a pointer to a value of type Value that can be atomically
+// loaded and stored. The zero value of an AtomicPtr represents nil.
+//
+// Note that copying AtomicPtr by value performs a non-atomic read of the
+// stored pointer, which is unsafe if Store() can be called concurrently; in
+// this case, do `dst.Store(src.Load())` instead.
+//
+// +stateify savable
+type AtomicPtr struct {
+	ptr unsafe.Pointer `state:".(*Value)"` // read and written only through sync/atomic in Load/Store
+}
+
+func (p *AtomicPtr) savePtr() *Value { // NOTE(review): presumably the stateify save hook for ptr (see its state tag) - confirm
+	return p.Load() // atomic read of the current pointer
+}
+
+func (p *AtomicPtr) loadPtr(v *Value) { // NOTE(review): presumably the stateify restore hook for ptr - confirm
+	p.Store(v) // atomic write of the supplied pointer
+}
+
+// Load atomically reads the pointer set by the most recent Store. It returns
+// nil if there has been no previous call to Store.
+func (p *AtomicPtr) Load() *Value {
+	return (*Value)(atomic.LoadPointer(&p.ptr))
+}
+
+// Store atomically sets the pointer returned by subsequent Loads to x.
+func (p *AtomicPtr) Store(x *Value) {
+	atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x))
+}
diff --git a/pkg/syncutil/atomicptrtest/BUILD b/pkg/syncutil/atomicptrtest/BUILD
new file mode 100644
index 000000000..63f411a90
--- /dev/null
+++ b/pkg/syncutil/atomicptrtest/BUILD
@@ -0,0 +1,29 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "atomicptr_int",
+    out = "atomicptr_int_unsafe.go",  # listed as the only source of :atomicptr
+    package = "atomicptr",
+    suffix = "Int",  # the test refers to the generated type as AtomicPtrInt
+    template = "//pkg/syncutil:generic_atomicptr",
+    types = {
+        "Value": "int",  # binds the template's Value type parameter to int
+    },
+)
+
+go_library(
+    name = "atomicptr",
+    srcs = ["atomicptr_int_unsafe.go"],  # output of :atomicptr_int
+    importpath = "gvisor.dev/gvisor/pkg/syncutil/atomicptr",
+)
+
+go_test(
+    name = "atomicptr_test",
+    size = "small",
+    srcs = ["atomicptr_test.go"],
+    embed = [":atomicptr"],  # the test declares package atomicptr
+)
diff --git a/pkg/syncutil/atomicptrtest/atomicptr_test.go b/pkg/syncutil/atomicptrtest/atomicptr_test.go
new file mode 100644
index 000000000..8fdc5112e
--- /dev/null
+++ b/pkg/syncutil/atomicptrtest/atomicptr_test.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomicptr
+
+import (
+	"testing"
+)
+
+func newInt(val int) *int { // returns a pointer to a fresh copy of val
+	return &val
+}
+
+func TestAtomicPtr(t *testing.T) { // checks the nil zero value and two Store/Load round trips
+	var p AtomicPtrInt
+	if got := p.Load(); got != nil { // the zero AtomicPtrInt must load as nil
+		t.Errorf("initial value is %p (%v), wanted nil", got, got)
+	}
+	want := newInt(42)
+	p.Store(want)
+	if got := p.Load(); got != want { // pointer comparison, not a comparison of the ints
+		t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want)
+	}
+	want = newInt(100)
+	p.Store(want) // a second Store must replace the first pointer
+	if got := p.Load(); got != want {
+		t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want)
+	}
+}
diff --git a/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go
new file mode 100644
index 000000000..7c6336e62
--- /dev/null
+++ b/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go
@@ -0,0 +1,21 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright 2019 The gVisor Authors.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.12
+// +build !go1.13
+
+// TODO(b/133868570): Delete once Go 1.12 is no longer supported.
+
+package syncutil
+
+import _ "unsafe"
+
+//go:linkname runtimeSemrelease112 sync.runtime_Semrelease
+func runtimeSemrelease112(s *uint32, handoff bool) // no body: bound to sync.runtime_Semrelease by the directive above
+
+func runtimeSemrelease(s *uint32, handoff bool, skipframes int) {
+	// 'skipframes' is only available starting from 1.13, so it is dropped here.
+	runtimeSemrelease112(s, handoff)
+}
diff --git a/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go
new file mode 100644
index 000000000..3c3673119
--- /dev/null
+++ b/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go
@@ -0,0 +1,16 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright 2019 The gVisor Authors.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.13
+// +build !go1.15
+
+// Check go:linkname function signatures when updating Go version.
+
+package syncutil
+
+import _ "unsafe"
+
+//go:linkname runtimeSemrelease sync.runtime_Semrelease
+func runtimeSemrelease(s *uint32, handoff bool, skipframes int) // no body: bound to sync.runtime_Semrelease by the directive above
diff --git a/pkg/syncutil/downgradable_rwmutex_test.go b/pkg/syncutil/downgradable_rwmutex_test.go
new file mode 100644
index 000000000..ffaf7ecc7
--- /dev/null
+++ b/pkg/syncutil/downgradable_rwmutex_test.go
@@ -0,0 +1,150 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright 2019 The gVisor Authors.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GOMAXPROCS=10 go test
+
+// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the
+// addition of downgradingWriter and the renaming of num_iterations to
+// numIterations to shut up Golint.
+
+package syncutil
+
+import (
+	"fmt"
+	"runtime"
+	"sync/atomic"
+	"testing"
+)
+
+func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) { // one reader goroutine for doTestParallelReaders
+	m.RLock()
+	clocked <- true // report that the read lock is held
+	<-cunlock // keep holding the read lock until told to release
+	m.RUnlock()
+	cdone <- true // report completion
+}
+
+func doTestParallelReaders(numReaders, gomaxprocs int) { // checks that numReaders goroutines can hold RLock at the same time
+	runtime.GOMAXPROCS(gomaxprocs)
+	var m DowngradableRWMutex
+	clocked := make(chan bool) // readers signal here once RLock has returned
+	cunlock := make(chan bool) // this function signals here to let a reader RUnlock
+	cdone := make(chan bool) // readers signal here after RUnlock
+	for i := 0; i < numReaders; i++ {
+		go parallelReader(&m, clocked, cunlock, cdone)
+	}
+	// Wait for all parallel RLock()s to succeed.
+	for i := 0; i < numReaders; i++ {
+		<-clocked
+	}
+	for i := 0; i < numReaders; i++ { // all readers hold the lock now; release them one by one
+		cunlock <- true
+	}
+	// Wait for the goroutines to finish.
+	for i := 0; i < numReaders; i++ {
+		<-cdone
+	}
+}
+
+func TestParallelReaders(t *testing.T) {
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) // GOMAXPROCS(-1) only queries; the deferred call restores that value
+	doTestParallelReaders(1, 4) // arguments are (numReaders, gomaxprocs)
+	doTestParallelReaders(3, 4)
+	doTestParallelReaders(4, 2)
+}
+
+func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { // read-side worker for HammerDowngradableRWMutex
+	for i := 0; i < numIterations; i++ {
+		rwm.RLock()
+		n := atomic.AddInt32(activity, 1) // each active reader adds 1 to activity
+		if n < 1 || n >= 10000 { // >= 10000 means a writer is active while this read lock is held
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ { // short busy loop while holding the read lock
+		}
+		atomic.AddInt32(activity, -1)
+		rwm.RUnlock()
+	}
+	cdone <- true // report completion to the caller
+}
+
+func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { // write-side worker for HammerDowngradableRWMutex
+	for i := 0; i < numIterations; i++ {
+		rwm.Lock()
+		n := atomic.AddInt32(activity, 10000) // each active writer adds 10000 to activity
+		if n != 10000 { // anything else means another reader or writer is active
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ { // short busy loop while holding the write lock
+		}
+		atomic.AddInt32(activity, -10000)
+		rwm.Unlock()
+	}
+	cdone <- true // report completion to the caller
+}
+
+func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { // worker that write-locks, downgrades, then read-unlocks
+	for i := 0; i < numIterations; i++ {
+		rwm.Lock()
+		n := atomic.AddInt32(activity, 10000) // each active writer adds 10000 to activity
+		if n != 10000 { // anything else means another reader or writer is active
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ { // short busy loop while holding the write lock
+		}
+		atomic.AddInt32(activity, -10000)
+		rwm.DowngradeLock() // from here on this goroutine holds a read lock
+		n = atomic.AddInt32(activity, 1) // now counted as a reader
+		if n < 1 || n >= 10000 { // >= 10000 means a writer got in despite the read lock
+			panic(fmt.Sprintf("wlock(%d)\n", n))
+		}
+		for i := 0; i < 100; i++ { // short busy loop while holding the read lock
+		}
+		atomic.AddInt32(activity, -1) // result unused, so it is no longer stored in n
+		rwm.RUnlock()
+	}
+	cdone <- true // report completion to the caller
+}
+
+func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) { // runs 4 writers and numReaders readers against one mutex
+	runtime.GOMAXPROCS(gomaxprocs)
+	// Number of active readers + 10000 * number of active writers.
+	var activity int32
+	var rwm DowngradableRWMutex
+	cdone := make(chan bool) // every worker sends once when it finishes
+	go writer(&rwm, numIterations, &activity, cdone)
+	go downgradingWriter(&rwm, numIterations, &activity, cdone)
+	var i int
+	for i = 0; i < numReaders/2; i++ { // first half of the readers
+		go reader(&rwm, numIterations, &activity, cdone)
+	}
+	go writer(&rwm, numIterations, &activity, cdone)
+	go downgradingWriter(&rwm, numIterations, &activity, cdone)
+	for ; i < numReaders; i++ { // remaining readers; i carries over from the loop above
+		go reader(&rwm, numIterations, &activity, cdone)
+	}
+	// Wait for the 4 writers and all readers to finish.
+	for i := 0; i < 4+numReaders; i++ {
+		<-cdone
+	}
+}
+
+func TestDowngradableRWMutex(t *testing.T) {
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
+	iterations := 1000
+	if testing.Short() {
+		iterations = 5
+	}
+	// Each entry is {gomaxprocs, numReaders}, listed in execution order.
+	configs := [][2]int{
+		{1, 1}, {1, 3}, {1, 10},
+		{4, 1}, {4, 3}, {4, 10},
+		{10, 1}, {10, 3}, {10, 10},
+		{10, 5},
+	}
+	for _, cfg := range configs {
+		HammerDowngradableRWMutex(cfg[0], cfg[1], iterations)
+	}
+}
diff --git a/pkg/syncutil/downgradable_rwmutex_unsafe.go b/pkg/syncutil/downgradable_rwmutex_unsafe.go
new file mode 100644
index 000000000..07feca402
--- /dev/null
+++ b/pkg/syncutil/downgradable_rwmutex_unsafe.go
@@ -0,0 +1,143 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright 2019 The gVisor Authors.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.12
+// +build !go1.15
+
+// Check go:linkname function signatures when updating Go version.
+
+// This is mostly copied from the standard library's sync/rwmutex.go.
+//
+// Happens-before relationships indicated to the race detector:
+// - Unlock -> Lock (via writerSem)
+// - Unlock -> RLock (via readerSem)
+// - RUnlock -> Lock (via writerSem)
+// - DowngradeLock -> RLock (via readerSem)
+
+package syncutil
+
+import (
+	"sync"
+	"sync/atomic"
+	"unsafe"
+)
+
+//go:linkname runtimeSemacquire sync.runtime_Semacquire
+func runtimeSemacquire(s *uint32) // no body: bound to sync.runtime_Semacquire by the directive above
+
+// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock
+// method, which turns a held write lock into a read lock.
+type DowngradableRWMutex struct {
+	w           sync.Mutex // held if there are pending writers
+	writerSem   uint32     // semaphore for writers to wait for completing readers
+	readerSem   uint32     // semaphore for readers to wait for completing writers
+	readerCount int32      // number of pending readers; Lock subtracts rwmutexMaxReaders while a writer is pending
+	readerWait  int32      // number of departing readers the pending writer still waits for
+}
+
+const rwmutexMaxReaders = 1 << 30 // bias Lock subtracts from readerCount to mark a pending writer
+
+// RLock locks rw for reading, blocking on readerSem while a writer is pending.
+func (rw *DowngradableRWMutex) RLock() {
+	if RaceEnabled {
+		RaceDisable()
+	}
+	if atomic.AddInt32(&rw.readerCount, 1) < 0 { // negative: Lock has applied the -rwmutexMaxReaders bias
+		// A writer is pending, wait for it.
+		runtimeSemacquire(&rw.readerSem)
+	}
+	if RaceEnabled {
+		RaceEnable()
+		RaceAcquire(unsafe.Pointer(&rw.readerSem)) // pairs with RaceRelease(readerSem) in Unlock and DowngradeLock
+	}
+}
+
+// RUnlock undoes a single RLock call. It panics if rw is not read-locked.
+func (rw *DowngradableRWMutex) RUnlock() {
+	if RaceEnabled {
+		RaceReleaseMerge(unsafe.Pointer(&rw.writerSem))
+		RaceDisable()
+	}
+	if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 {
+		if r+1 == 0 || r+1 == -rwmutexMaxReaders { // r+1 is the old count: no readers, with or without the writer bias
+			panic("RUnlock of unlocked DowngradableRWMutex")
+		}
+		// A writer is pending.
+		if atomic.AddInt32(&rw.readerWait, -1) == 0 {
+			// The last reader unblocks the writer.
+			runtimeSemrelease(&rw.writerSem, false, 0)
+		}
+	}
+	if RaceEnabled {
+		RaceEnable()
+	}
+}
+
+// Lock locks rw for writing, waiting for other writers and active readers.
+func (rw *DowngradableRWMutex) Lock() {
+	if RaceEnabled {
+		RaceDisable()
+	}
+	// First, resolve competition with other writers.
+	rw.w.Lock()
+	// Announce to readers there is a pending writer.
+	r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders // r is the reader count before the bias
+	// Wait for active readers.
+	if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 { // sum is 0 if those readers already ran RUnlock
+		runtimeSemacquire(&rw.writerSem)
+	}
+	if RaceEnabled {
+		RaceEnable()
+		RaceAcquire(unsafe.Pointer(&rw.writerSem)) // pairs with the releases on writerSem in RUnlock and Unlock
+	}
+}
+
+// Unlock unlocks rw for writing. It panics if rw is not write-locked.
+func (rw *DowngradableRWMutex) Unlock() {
+	if RaceEnabled {
+		RaceRelease(unsafe.Pointer(&rw.writerSem))
+		RaceRelease(unsafe.Pointer(&rw.readerSem))
+		RaceDisable()
+	}
+	// Announce to readers there is no active writer.
+	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders) // r is the number of readers that queued meanwhile
+	if r >= rwmutexMaxReaders { // the writer bias was not in place beforehand
+		panic("Unlock of unlocked DowngradableRWMutex")
+	}
+	// Unblock blocked readers, if any.
+	for i := 0; i < int(r); i++ {
+		runtimeSemrelease(&rw.readerSem, false, 0)
+	}
+	// Allow other writers to proceed.
+	rw.w.Unlock()
+	if RaceEnabled {
+		RaceEnable()
+	}
+}
+
+// DowngradeLock atomically unlocks rw for writing and locks it for reading.
+func (rw *DowngradableRWMutex) DowngradeLock() {
+	if RaceEnabled {
+		RaceRelease(unsafe.Pointer(&rw.readerSem))
+		RaceDisable()
+	}
+	// Announce to readers there is no active writer and one additional reader.
+	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1)
+	if r >= rwmutexMaxReaders+1 { // the writer bias was not in place beforehand
+		panic("DowngradeLock of unlocked DowngradableRWMutex")
+	}
+	// Unblock blocked readers, if any. Note that this loop starts at 1 since r
+	// includes this goroutine.
+	for i := 1; i < int(r); i++ {
+		runtimeSemrelease(&rw.readerSem, false, 0)
+	}
+	// Allow other writers to proceed to rw.w.Lock(). Note that they will still
+	// block on rw.writerSem since at least this reader exists, such that
+	// DowngradeLock() is atomic with the previous write lock.
+	rw.w.Unlock()
+	if RaceEnabled {
+		RaceEnable()
+	}
+}
diff --git a/pkg/syncutil/memmove_unsafe.go b/pkg/syncutil/memmove_unsafe.go
new file mode 100644
index 000000000..348675baa
--- /dev/null
+++ b/pkg/syncutil/memmove_unsafe.go
@@ -0,0 +1,28 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.12
+// +build !go1.15
+
+// Check go:linkname function signatures when updating Go version.
+
+package syncutil
+
+import (
+	"unsafe"
+)
+
+//go:linkname memmove runtime.memmove
+//go:noescape
+func memmove(to, from unsafe.Pointer, n uintptr) // no body: bound to runtime.memmove by the go:linkname directive above
+
+// Memmove forwards to runtime.memmove via the unexported memmove. It is
+// exported for SeqAtomicLoad/SeqAtomicTryLoad<T>, which can't define it
+// because go_generics can't update the go:linkname annotation. go:linkname
+// also silently doesn't work if the local name is exported (undocumented),
+// which is why this indirection is necessary.
+func Memmove(to, from unsafe.Pointer, n uintptr) {
+	memmove(to, from, n)
+}
diff --git a/pkg/syncutil/norace_unsafe.go b/pkg/syncutil/norace_unsafe.go
new file mode 100644
index 000000000..0a0a9deda
--- /dev/null
+++ b/pkg/syncutil/norace_unsafe.go
@@ -0,0 +1,35 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !race
+
+package syncutil
+
+import (
+	"unsafe"
+)
+
+// RaceEnabled is true if the Go data race detector is enabled (false in this !race build).
+const RaceEnabled = false
+
+// RaceDisable is a no-op in !race builds; race builds call runtime.RaceDisable.
+func RaceDisable() {
+}
+
+// RaceEnable is a no-op in !race builds; race builds call runtime.RaceEnable.
+func RaceEnable() {
+}
+
+// RaceAcquire is a no-op in !race builds; race builds call runtime.RaceAcquire.
+func RaceAcquire(addr unsafe.Pointer) {
+}
+
+// RaceRelease is a no-op in !race builds; race builds call runtime.RaceRelease.
+func RaceRelease(addr unsafe.Pointer) {
+}
+
+// RaceReleaseMerge is a no-op in !race builds; race builds call runtime.RaceReleaseMerge.
+func RaceReleaseMerge(addr unsafe.Pointer) {
+}
diff --git a/pkg/syncutil/race_unsafe.go b/pkg/syncutil/race_unsafe.go
new file mode 100644
index 000000000..206067ec1
--- /dev/null
+++ b/pkg/syncutil/race_unsafe.go
@@ -0,0 +1,41 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build race
+
+package syncutil
+
+import (
+	"runtime"
+	"unsafe"
+)
+
+// RaceEnabled is true if the Go data race detector is enabled (true in this race build).
+const RaceEnabled = true
+
+// RaceDisable forwards to runtime.RaceDisable and has the same semantics.
+func RaceDisable() {
+	runtime.RaceDisable()
+}
+
+// RaceEnable forwards to runtime.RaceEnable and has the same semantics.
+func RaceEnable() {
+	runtime.RaceEnable()
+}
+
+// RaceAcquire forwards to runtime.RaceAcquire and has the same semantics.
+func RaceAcquire(addr unsafe.Pointer) {
+	runtime.RaceAcquire(addr)
+}
+
+// RaceRelease forwards to runtime.RaceRelease and has the same semantics.
+func RaceRelease(addr unsafe.Pointer) {
+	runtime.RaceRelease(addr)
+}
+
+// RaceReleaseMerge forwards to runtime.RaceReleaseMerge and has the same semantics.
+func RaceReleaseMerge(addr unsafe.Pointer) {
+	runtime.RaceReleaseMerge(addr)
+}
diff --git a/pkg/syncutil/seqatomic_unsafe.go b/pkg/syncutil/seqatomic_unsafe.go
new file mode 100644
index 000000000..cb6d2eb22
--- /dev/null
+++ b/pkg/syncutil/seqatomic_unsafe.go
@@ -0,0 +1,72 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package template doesn't exist. This file must be instantiated using the
+// go_template_instance rule in tools/go_generics/defs.bzl.
+package template
+
+import (
+	"fmt"
+	"reflect"
+	"strings"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/syncutil"
+)
+
+// Value is a required type parameter.
+//
+// Value must not contain any pointers, including strings, interface objects,
+// function objects, slices, maps, channels, unsafe.Pointer, and arrays or
+// structs containing any of the above. An init() function will panic if this
+// property does not hold.
+type Value struct{}
+
+// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
+// with any writer critical sections in sc. It retries until ReadOk succeeds.
+func SeqAtomicLoad(sc *syncutil.SeqCount, ptr *Value) Value {
+	// This function doesn't use SeqAtomicTryLoad because doing so is
+	// measurably, significantly (~20%) slower; Go is awful at inlining.
+	var val Value
+	for {
+		epoch := sc.BeginRead()
+		if syncutil.RaceEnabled {
+			// runtime.RaceDisable() doesn't actually stop the race detector,
+			// so it can't help us here. Instead, call runtime.memmove
+			// directly, which is not instrumented by the race detector.
+			syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
+		} else {
+			// This is ~40% faster for short reads than going through memmove.
+			val = *ptr
+		}
+		if sc.ReadOk(epoch) { // epoch unchanged: no writer overlapped the copy
+			break
+		}
+	}
+	return val
+}
+
+// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section
+// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read
+// would race with a writer critical section, SeqAtomicTryLoad returns
+// (unspecified, false). Unlike SeqAtomicLoad, it never retries.
+func SeqAtomicTryLoad(sc *syncutil.SeqCount, epoch syncutil.SeqCountEpoch, ptr *Value) (Value, bool) {
+	var val Value
+	if syncutil.RaceEnabled { // same copy strategy as SeqAtomicLoad
+		syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
+	} else {
+		val = *ptr
+	}
+	return val, sc.ReadOk(epoch) // val is only meaningful when the bool is true
+}
+
+func init() { // panics at package init if Value contains pointers
+	var val Value
+	typ := reflect.TypeOf(val)
+	name := typ.Name() // used as the root of the field paths reported by PointersInType
+	if ptrs := syncutil.PointersInType(typ, name); len(ptrs) != 0 {
+		panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n")))
+	}
+}
diff --git a/pkg/syncutil/seqatomictest/BUILD b/pkg/syncutil/seqatomictest/BUILD
new file mode 100644
index 000000000..ba18f3238
--- /dev/null
+++ b/pkg/syncutil/seqatomictest/BUILD
@@ -0,0 +1,35 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "seqatomic_int",
+    out = "seqatomic_int_unsafe.go",  # listed as the only source of :seqatomic
+    package = "seqatomic",
+    suffix = "Int",  # the test calls SeqAtomicLoadInt / SeqAtomicTryLoadInt
+    template = "//pkg/syncutil:generic_seqatomic",
+    types = {
+        "Value": "int",  # binds the template's Value type parameter to int
+    },
+)
+
+go_library(
+    name = "seqatomic",
+    srcs = ["seqatomic_int_unsafe.go"],  # output of :seqatomic_int
+    importpath = "gvisor.dev/gvisor/pkg/syncutil/seqatomic",
+    deps = [
+        "//pkg/syncutil",  # the template imports gvisor.dev/gvisor/pkg/syncutil
+    ],
+)
+
+go_test(
+    name = "seqatomic_test",
+    size = "small",
+    srcs = ["seqatomic_test.go"],
+    embed = [":seqatomic"],  # the test declares package seqatomic
+    deps = [
+        "//pkg/syncutil",  # the test uses syncutil.SeqCount directly
+    ],
+)
diff --git a/pkg/syncutil/seqatomictest/seqatomic_test.go b/pkg/syncutil/seqatomictest/seqatomic_test.go
new file mode 100644
index 000000000..b0db44999
--- /dev/null
+++ b/pkg/syncutil/seqatomictest/seqatomic_test.go
@@ -0,0 +1,132 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package seqatomic
+
+import (
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/syncutil"
+)
+
+func TestSeqAtomicLoadUncontended(t *testing.T) { // load with no writer ever active
+	var seq syncutil.SeqCount
+	const want = 1
+	data := want
+	if got := SeqAtomicLoadInt(&seq, &data); got != want {
+		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
+	}
+}
+
+func TestSeqAtomicLoadAfterWrite(t *testing.T) { // load after a completed writer critical section
+	var seq syncutil.SeqCount
+	var data int
+	const want = 1
+	seq.BeginWrite()
+	data = want
+	seq.EndWrite()
+	if got := SeqAtomicLoadInt(&seq, &data); got != want {
+		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
+	}
+}
+
+func TestSeqAtomicLoadDuringWrite(t *testing.T) { // load must wait for an in-progress write to end
+	var seq syncutil.SeqCount
+	var data int
+	const want = 1
+	seq.BeginWrite() // the writer critical section stays open until the goroutine below ends it
+	go func() {
+		time.Sleep(time.Second)
+		data = want
+		seq.EndWrite()
+	}()
+	if got := SeqAtomicLoadInt(&seq, &data); got != want { // BeginRead inside spins while the epoch is odd
+		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
+	}
+}
+
+func TestSeqAtomicTryLoadUncontended(t *testing.T) { // try-load with no writer must succeed
+	var seq syncutil.SeqCount
+	const want = 1
+	data := want
+	epoch := seq.BeginRead()
+	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want {
+		t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want)
+	}
+}
+
+func TestSeqAtomicTryLoadDuringWrite(t *testing.T) { // try-load must fail while a write is open
+	var seq syncutil.SeqCount
+	var data int
+	epoch := seq.BeginRead()
+	seq.BeginWrite() // the epoch changes after the reader captured it
+	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok {
+		t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got)
+	}
+	seq.EndWrite()
+}
+
+func TestSeqAtomicTryLoadAfterWrite(t *testing.T) { // try-load must fail if a write completed since BeginRead
+	var seq syncutil.SeqCount
+	var data int
+	epoch := seq.BeginRead()
+	seq.BeginWrite()
+	seq.EndWrite() // the epoch is even again, but differs from the captured value
+	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok {
+		t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got)
+	}
+}
+
+func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) { // parallel SeqAtomicLoadInt with no writers
+	var seq syncutil.SeqCount
+	const want = 42
+	data := want
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			if got := SeqAtomicLoadInt(&seq, &data); got != want {
+				b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
+			}
+		}
+	})
+}
+
+func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) { // parallel SeqAtomicTryLoadInt with no writers
+	var seq syncutil.SeqCount
+	const want = 42
+	data := want
+	b.RunParallel(func(pb *testing.PB) {
+		epoch := seq.BeginRead() // taken once per worker; stays valid because nothing writes
+		for pb.Next() {
+			if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want {
+				b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want)
+			}
+		}
+	})
+}
+
+// For comparison: the same uncontended parallel read using sync/atomic.Value.
+func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) {
+	var a atomic.Value
+	const want = 42
+	a.Store(int(want))
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			if got := a.Load().(int); got != want {
+				b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want)
+			}
+		}
+	})
+}
diff --git a/pkg/syncutil/seqcount.go b/pkg/syncutil/seqcount.go
new file mode 100644
index 000000000..11d8dbfaa
--- /dev/null
+++ b/pkg/syncutil/seqcount.go
@@ -0,0 +1,149 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syncutil
+
+import (
+	"fmt"
+	"reflect"
+	"runtime"
+	"sync/atomic"
+)
+
+// SeqCount is a synchronization primitive for optimistic reader/writer
+// synchronization in cases where readers can work with stale data and
+// therefore do not need to block writers.
+//
+// Compared to sync/atomic.Value:
+//
+// - Mutation of SeqCount-protected data does not require memory allocation,
+// whereas atomic.Value generally does. This is a significant advantage when
+// writes are common.
+//
+// - Atomic reads of SeqCount-protected data require copying. This is a
+// disadvantage when atomic reads are common.
+//
+// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other
+// operations to be made atomic with reads of SeqCount-protected data.
+//
+// - SeqCount may be less flexible: as of this writing, SeqCount-protected data
+// cannot include pointers.
+//
+// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected
+// data require instantiating function templates using go_generics (see
+// seqatomic_unsafe.go).
+type SeqCount struct {
+	// epoch is incremented by BeginWrite and EndWrite, such that epoch is odd
+	// if a writer critical section is active, and a read from data protected
+	// by this SeqCount is atomic iff epoch is the same even value before and
+	// after the read.
+	epoch uint32
+}
+
+// SeqCountEpoch tracks writer critical sections in a SeqCount.
+type SeqCountEpoch struct {
+	val uint32 // value of SeqCount.epoch observed by BeginRead
+}
+
+// We assume that:
+//
+// - All functions in sync/atomic that perform a memory read are at least a
+// read fence: memory reads before calls to such functions cannot be reordered
+// after the call, and memory reads after calls to such functions cannot be
+// reordered before the call, even if those reads do not use sync/atomic.
+//
+// - All functions in sync/atomic that perform a memory write are at least a
+// write fence: memory writes before calls to such functions cannot be
+// reordered after the call, and memory writes after calls to such functions
+// cannot be reordered before the call, even if those writes do not use
+// sync/atomic.
+//
+// As of this writing, the Go memory model completely fails to describe
+// sync/atomic, but these properties are implied by
+// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8.
+
+// BeginRead indicates the beginning of a reader critical section. Reader
+// critical sections DO NOT BLOCK writer critical sections, so operations in a
+// reader critical section MAY RACE with writer critical sections. Races are
+// detected by ReadOk at the end of the reader critical section. Thus, the
+// low-level structure of readers is generally:
+//
+//     for {
+//         epoch := seq.BeginRead()
+//         // do something idempotent with seq-protected data
+//         if seq.ReadOk(epoch) {
+//             break
+//         }
+//     }
+//
+// However, since reader critical sections may race with writer critical
+// sections, the Go race detector will (accurately) flag data races in readers
+// using this pattern. Most users of SeqCount will need to use the
+// SeqAtomicLoad function template in seqatomic_unsafe.go.
+func (s *SeqCount) BeginRead() SeqCountEpoch {
+	epoch := atomic.LoadUint32(&s.epoch)
+	for epoch&1 != 0 { // odd epoch: a writer critical section is active, so yield and retry
+		runtime.Gosched()
+		epoch = atomic.LoadUint32(&s.epoch)
+	}
+	return SeqCountEpoch{epoch}
+}
+
+// ReadOk returns true if the reader critical section initiated by a previous
+// call to BeginRead() that returned epoch did not race with any writer critical
+// sections.
+//
+// ReadOk may be called any number of times during a reader critical section.
+// Reader critical sections do not need to be explicitly terminated; the last
+// call to ReadOk is implicitly the end of the reader critical section.
+func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool {
+	return atomic.LoadUint32(&s.epoch) == epoch.val // any BeginWrite/EndWrite since BeginRead changes s.epoch
+}
+
+// BeginWrite indicates the beginning of a writer critical section.
+//
+// SeqCount does not support concurrent writer critical sections; clients with
+// concurrent writers must synchronize them using e.g. sync.Mutex.
+func (s *SeqCount) BeginWrite() {
+	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 { // even result: epoch was already odd before the increment
+		panic("SeqCount.BeginWrite during writer critical section")
+	}
+}
+
+// EndWrite ends the effect of a preceding BeginWrite.
+func (s *SeqCount) EndWrite() {
+	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 { // odd result: epoch was even, so no write was in progress
+		panic("SeqCount.EndWrite outside writer critical section")
+	}
+}
+
+// PointersInType returns a list of pointers reachable from values named
+// valName of the given type.
+//
+// PointersInType is not exhaustive, but it is guaranteed that if typ contains
+// at least one pointer, then PointersInType returns a non-empty list.
+func PointersInType(typ reflect.Type, valName string) []string {
+	switch kind := typ.Kind(); kind {
+	case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
+		return nil
+
+	case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer:
+		return []string{valName} // strings are reported as pointers too
+
+	case reflect.Array:
+		return PointersInType(typ.Elem(), valName+"[]") // an array is judged by its element type
+
+	case reflect.Struct:
+		var ptrs []string
+		for i, n := 0, typ.NumField(); i < n; i++ {
+			field := typ.Field(i)
+			ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...)
+		}
+		return ptrs
+
+	default:
+		return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)} // unknown kinds are reported, never passed silently
+	}
+}
diff --git a/pkg/syncutil/seqcount_test.go b/pkg/syncutil/seqcount_test.go
new file mode 100644
index 000000000..14d6aedea
--- /dev/null
+++ b/pkg/syncutil/seqcount_test.go
@@ -0,0 +1,153 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syncutil
+
+import (
+	"reflect"
+	"testing"
+	"time"
+)
+
+func TestSeqCountWriteUncontended(t *testing.T) {
+	var seq SeqCount
+	seq.BeginWrite()
+	seq.EndWrite()
+}
+
+func TestSeqCountReadUncontended(t *testing.T) {
+	var seq SeqCount
+	epoch := seq.BeginRead()
+	if !seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got false, wanted true")
+	}
+}
+
+func TestSeqCountBeginReadAfterWrite(t *testing.T) {
+	var seq SeqCount
+	var data int32
+	const want = 1
+	seq.BeginWrite()
+	data = want
+	seq.EndWrite()
+	epoch := seq.BeginRead()
+	if data != want {
+		t.Errorf("Reader: got %v, wanted %v", data, want)
+	}
+	if !seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got false, wanted true")
+	}
+}
+
+func TestSeqCountBeginReadDuringWrite(t *testing.T) {
+	var seq SeqCount
+	var data int
+	const want = 1
+	seq.BeginWrite()
+	go func() {
+		time.Sleep(time.Second)
+		data = want
+		seq.EndWrite()
+	}()
+	epoch := seq.BeginRead()
+	if data != want {
+		t.Errorf("Reader: got %v, wanted %v", data, want)
+	}
+	if !seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got false, wanted true")
+	}
+}
+
+func TestSeqCountReadOkAfterWrite(t *testing.T) {
+	var seq SeqCount
+	epoch := seq.BeginRead()
+	seq.BeginWrite()
+	seq.EndWrite()
+	if seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got true, wanted false")
+	}
+}
+
+func TestSeqCountReadOkDuringWrite(t *testing.T) {
+	var seq SeqCount
+	epoch := seq.BeginRead()
+	seq.BeginWrite()
+	if seq.ReadOk(epoch) {
+		t.Errorf("ReadOk: got true, wanted false")
+	}
+	seq.EndWrite()
+}
+
+func BenchmarkSeqCountWriteUncontended(b *testing.B) {
+	var seq SeqCount
+	for i := 0; i < b.N; i++ {
+		seq.BeginWrite()
+		seq.EndWrite()
+	}
+}
+
+func BenchmarkSeqCountReadUncontended(b *testing.B) {
+	var seq SeqCount
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			if epoch := seq.BeginRead(); !seq.ReadOk(epoch) {
+				b.Errorf("ReadOk: got false, wanted true") // not Fatalf: this body runs on RunParallel worker goroutines
+				return
+			}
+		}
+	})
+}
+
+func TestPointersInType(t *testing.T) {
+	for _, test := range []struct {
+		name string // used for both test and value name
+		val  interface{}
+		ptrs []string
+	}{
+		{
+			name: "EmptyStruct",
+			val:  struct{}{},
+		},
+		{
+			name: "Int",
+			val:  int(0),
+		},
+		{
+			name: "MixedStruct",
+			val: struct {
+				b             bool
+				I             int
+				ExportedPtr   *struct{}
+				unexportedPtr *struct{}
+				arr           [2]int
+				ptrArr        [2]*int
+				nestedStruct  struct {
+					nestedNonptr int
+					nestedPtr    *int
+				}
+				structArr [1]struct {
+					nonptr int
+					ptr    *int
+				}
+			}{},
+			ptrs: []string{
+				"MixedStruct.ExportedPtr",
+				"MixedStruct.unexportedPtr",
+				"MixedStruct.ptrArr[]",
+				"MixedStruct.nestedStruct.nestedPtr",
+				"MixedStruct.structArr[].ptr",
+			},
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			typ := reflect.TypeOf(test.val)
+			ptrs := PointersInType(typ, test.name)
+			t.Logf("Found pointers: %v", ptrs)
+			if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) {
+				t.Errorf("Got %v, wanted %v", ptrs, test.ptrs)
+			}
+		})
+	}
+}
diff --git a/pkg/syncutil/syncutil.go b/pkg/syncutil/syncutil.go
new file mode 100644
index 000000000..66e750d06
--- /dev/null
+++ b/pkg/syncutil/syncutil.go
@@ -0,0 +1,7 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package syncutil provides synchronization primitives.
+package syncutil
diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc
index 427c42ede..e08c578f0 100644
--- a/test/syscalls/linux/accept_bind.cc
+++ b/test/syscalls/linux/accept_bind.cc
@@ -14,8 +14,10 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include <algorithm>
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/accept_bind_stream.cc b/test/syscalls/linux/accept_bind_stream.cc
index 7bcd91e9e..4857f160b 100644
--- a/test/syscalls/linux/accept_bind_stream.cc
+++ b/test/syscalls/linux/accept_bind_stream.cc
@@ -14,8 +14,10 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include <algorithm>
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/chmod.cc b/test/syscalls/linux/chmod.cc
index 7e918b9b2..a06b5cfd6 100644
--- a/test/syscalls/linux/chmod.cc
+++ b/test/syscalls/linux/chmod.cc
@@ -16,6 +16,7 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
+
 #include <string>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index de1611c21..04bc2d7b9 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -19,6 +19,7 @@
 #include <sys/stat.h>
 #include <syscall.h>
 #include <unistd.h>
+
 #include <string>
 #include <vector>
 
diff --git a/test/syscalls/linux/clock_gettime.cc b/test/syscalls/linux/clock_gettime.cc
index c9e3ed6b2..2aa91691e 100644
--- a/test/syscalls/linux/clock_gettime.cc
+++ b/test/syscalls/linux/clock_gettime.cc
@@ -14,6 +14,7 @@
 
 #include <pthread.h>
 #include <sys/time.h>
+
 #include <cerrno>
 #include <cstdint>
 #include <ctime>
diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc
index 4e0a13f8b..00b96b34a 100644
--- a/test/syscalls/linux/concurrency.cc
+++ b/test/syscalls/linux/concurrency.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <signal.h>
+
 #include <atomic>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc
index 0a3931e5a..736452b0c 100644
--- a/test/syscalls/linux/exec_binary.cc
+++ b/test/syscalls/linux/exec_binary.cc
@@ -20,6 +20,7 @@
 #include <sys/types.h>
 #include <sys/user.h>
 #include <unistd.h>
+
 #include <algorithm>
 #include <functional>
 #include <iterator>
diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h
index 4d155b618..4e048320e 100644
--- a/test/syscalls/linux/file_base.h
+++ b/test/syscalls/linux/file_base.h
@@ -27,6 +27,7 @@
 #include <sys/types.h>
 #include <sys/uio.h>
 #include <unistd.h>
+
 #include <cstring>
 #include <string>
 
diff --git a/test/syscalls/linux/flock.cc b/test/syscalls/linux/flock.cc
index b4a91455d..3ecb8db8e 100644
--- a/test/syscalls/linux/flock.cc
+++ b/test/syscalls/linux/flock.cc
@@ -14,6 +14,7 @@
 
 #include <errno.h>
 #include <sys/file.h>
+
 #include <string>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index dd6e1a422..371890110 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -20,6 +20,7 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
+
 #include <atomic>
 #include <cstdlib>
 
diff --git a/test/syscalls/linux/getdents.cc b/test/syscalls/linux/getdents.cc
index fe9cfafe8..ad2dbacb8 100644
--- a/test/syscalls/linux/getdents.cc
+++ b/test/syscalls/linux/getdents.cc
@@ -23,6 +23,7 @@
 #include <sys/types.h>
 #include <syscall.h>
 #include <unistd.h>
+
 #include <map>
 #include <string>
 #include <unordered_map>
diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index 57e99596f..8398fc95f 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -12,13 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "test/syscalls/linux/ip_socket_test_util.h"
+
 #include <net/if.h>
 #include <netinet/in.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
-#include <cstring>
 
-#include "test/syscalls/linux/ip_socket_test_util.h"
+#include <cstring>
 
 namespace gvisor {
 namespace testing {
diff --git a/test/syscalls/linux/memory_accounting.cc b/test/syscalls/linux/memory_accounting.cc
index ff2f49863..94aea4077 100644
--- a/test/syscalls/linux/memory_accounting.cc
+++ b/test/syscalls/linux/memory_accounting.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <sys/mman.h>
+
 #include <map>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc
index 283c21ed3..620b4f8b4 100644
--- a/test/syscalls/linux/mlock.cc
+++ b/test/syscalls/linux/mlock.cc
@@ -16,6 +16,7 @@
 #include <sys/resource.h>
 #include <sys/syscall.h>
 #include <unistd.h>
+
 #include <cerrno>
 #include <cstring>
 
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index a112316e9..6f2639d8a 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -28,6 +28,7 @@
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>
+
 #include <vector>
 
 #include "gmock/gmock.h"
diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc
index e35be3cab..a3e9745cf 100644
--- a/test/syscalls/linux/mount.cc
+++ b/test/syscalls/linux/mount.cc
@@ -18,6 +18,7 @@
 #include <sys/mount.h>
 #include <sys/stat.h>
 #include <unistd.h>
+
 #include <functional>
 #include <memory>
 #include <string>
diff --git a/test/syscalls/linux/read.cc b/test/syscalls/linux/read.cc
index 4430fa3c2..2633ba31b 100644
--- a/test/syscalls/linux/read.cc
+++ b/test/syscalls/linux/read.cc
@@ -14,6 +14,7 @@
 
 #include <fcntl.h>
 #include <unistd.h>
+
 #include <vector>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/rename.cc b/test/syscalls/linux/rename.cc
index 5b474ff32..833c0dc4f 100644
--- a/test/syscalls/linux/rename.cc
+++ b/test/syscalls/linux/rename.cc
@@ -14,6 +14,7 @@
 
 #include <fcntl.h>
 #include <stdio.h>
+
 #include <string>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index e77586852..7e41fe7d8 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -25,6 +25,7 @@
 #include <time.h>
 #include <ucontext.h>
 #include <unistd.h>
+
 #include <atomic>
 
 #include "gmock/gmock.h"
diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc
index e06a2666d..424e2a67f 100644
--- a/test/syscalls/linux/select.cc
+++ b/test/syscalls/linux/select.cc
@@ -16,6 +16,7 @@
 #include <sys/resource.h>
 #include <sys/select.h>
 #include <sys/time.h>
+
 #include <climits>
 #include <csignal>
 #include <cstdio>
diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc
index eb7a3966f..7ba752599 100644
--- a/test/syscalls/linux/shm.cc
+++ b/test/syscalls/linux/shm.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include <stdio.h>
-
 #include <sys/ipc.h>
 #include <sys/mman.h>
 #include <sys/shm.h>
diff --git a/test/syscalls/linux/socket_blocking.cc b/test/syscalls/linux/socket_blocking.cc
index d7ce57566..7e88aa2d9 100644
--- a/test/syscalls/linux/socket_blocking.cc
+++ b/test/syscalls/linux/socket_blocking.cc
@@ -17,6 +17,7 @@
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <sys/un.h>
+
 #include <cstdio>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/socket_ip_loopback_blocking.cc b/test/syscalls/linux/socket_ip_loopback_blocking.cc
index d7fc9715b..e58eedaba 100644
--- a/test/syscalls/linux/socket_ip_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_loopback_blocking.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <netinet/tcp.h>
+
 #include <vector>
 
 #include "test/syscalls/linux/ip_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
index 0dc274e2d..d11f7cc23 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <netinet/tcp.h>
+
 #include <vector>
 
 #include "test/syscalls/linux/ip_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
index cd3ad97d0..fcd20102f 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <netinet/tcp.h>
+
 #include <vector>
 
 #include "test/syscalls/linux/ip_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
index 1acdecc17..63a05b799 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <netinet/tcp.h>
+
 #include <vector>
 
 #include "test/syscalls/linux/ip_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
index 3c3712b50..80f12b0a9 100644
--- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
@@ -18,6 +18,7 @@
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <sys/un.h>
+
 #include <cstdio>
 #include <cstring>
 
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
index 92f03e045..3ac790873 100644
--- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h"
+
 #include <vector>
 
 #include "test/syscalls/linux/ip_socket_test_util.h"
-#include "test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
 
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
index 9d4e1ab97..8f47952b0 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h"
+
 #include <vector>
 
 #include "test/syscalls/linux/ip_socket_test_util.h"
-#include "test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
 
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index 5f05bab10..723f5d728 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -12,15 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <sys/socket.h>
+#include "test/syscalls/linux/socket_netlink_util.h"
 
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
+#include <sys/socket.h>
 
 #include <vector>
 
 #include "absl/strings/str_cat.h"
-#include "test/syscalls/linux/socket_netlink_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 
 namespace gvisor {
diff --git a/test/syscalls/linux/socket_unix_blocking_local.cc b/test/syscalls/linux/socket_unix_blocking_local.cc
index 1994139e6..6f84221b2 100644
--- a/test/syscalls/linux/socket_unix_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_blocking_local.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "test/syscalls/linux/socket_blocking.h"
-
 #include <vector>
 
+#include "test/syscalls/linux/socket_blocking.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_dgram.cc b/test/syscalls/linux/socket_unix_dgram.cc
index 3245cf7c9..af0df4fb4 100644
--- a/test/syscalls/linux/socket_unix_dgram.cc
+++ b/test/syscalls/linux/socket_unix_dgram.cc
@@ -16,6 +16,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
index cd4fba25c..2db8b68d3 100644
--- a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
+++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
@@ -14,6 +14,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
index da762cd83..8855d5001 100644
--- a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "test/syscalls/linux/socket_non_stream_blocking.h"
-
 #include <vector>
 
+#include "test/syscalls/linux/socket_non_stream_blocking.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_seqpacket.cc b/test/syscalls/linux/socket_unix_seqpacket.cc
index 60fa9e38a..84d3a569e 100644
--- a/test/syscalls/linux/socket_unix_seqpacket.cc
+++ b/test/syscalls/linux/socket_unix_seqpacket.cc
@@ -16,6 +16,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
index fa0a9d367..08e579ba7 100644
--- a/test/syscalls/linux/socket_unix_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "test/syscalls/linux/socket_stream_blocking.h"
-
 #include <vector>
 
+#include "test/syscalls/linux/socket_stream_blocking.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
index ec777c59f..1936aa135 100644
--- a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
@@ -11,10 +11,9 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "test/syscalls/linux/socket_stream_nonblock.h"
-
 #include <vector>
 
+#include "test/syscalls/linux/socket_stream_nonblock.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_abstract.cc b/test/syscalls/linux/socket_unix_unbound_abstract.cc
index 7f5816ace..8b1762000 100644
--- a/test/syscalls/linux/socket_unix_unbound_abstract.cc
+++ b/test/syscalls/linux/socket_unix_unbound_abstract.cc
@@ -14,6 +14,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_filesystem.cc b/test/syscalls/linux/socket_unix_unbound_filesystem.cc
index b14f24086..cab912152 100644
--- a/test/syscalls/linux/socket_unix_unbound_filesystem.cc
+++ b/test/syscalls/linux/socket_unix_unbound_filesystem.cc
@@ -14,6 +14,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
index 50ffa1d04..cb99030f5 100644
--- a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
+++ b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
@@ -14,6 +14,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/socket_unix_unbound_stream.cc b/test/syscalls/linux/socket_unix_unbound_stream.cc
index 344918c34..f185dded3 100644
--- a/test/syscalls/linux/socket_unix_unbound_stream.cc
+++ b/test/syscalls/linux/socket_unix_unbound_stream.cc
@@ -14,6 +14,7 @@
 
 #include <stdio.h>
 #include <sys/un.h>
+
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
diff --git a/test/syscalls/linux/sync.cc b/test/syscalls/linux/sync.cc
index fe479390d..8aa2525a9 100644
--- a/test/syscalls/linux/sync.cc
+++ b/test/syscalls/linux/sync.cc
@@ -14,10 +14,9 @@
 
 #include <fcntl.h>
 #include <stdio.h>
-#include <unistd.h>
-
 #include <sys/syscall.h>
 #include <unistd.h>
+
 #include <string>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/truncate.cc b/test/syscalls/linux/truncate.cc
index e5cc5d97c..c988c6380 100644
--- a/test/syscalls/linux/truncate.cc
+++ b/test/syscalls/linux/truncate.cc
@@ -19,6 +19,7 @@
 #include <sys/vfs.h>
 #include <time.h>
 #include <unistd.h>
+
 #include <iostream>
 #include <string>
 
diff --git a/test/syscalls/linux/unix_domain_socket_test_util.cc b/test/syscalls/linux/unix_domain_socket_test_util.cc
index 7fb9eed8d..b05ab2900 100644
--- a/test/syscalls/linux/unix_domain_socket_test_util.cc
+++ b/test/syscalls/linux/unix_domain_socket_test_util.cc
@@ -15,6 +15,7 @@
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 
 #include <sys/un.h>
+
 #include <vector>
 
 #include "gtest/gtest.h"
diff --git a/test/syscalls/linux/unix_domain_socket_test_util.h b/test/syscalls/linux/unix_domain_socket_test_util.h
index 5eca0b7f0..b8073db17 100644
--- a/test/syscalls/linux/unix_domain_socket_test_util.h
+++ b/test/syscalls/linux/unix_domain_socket_test_util.h
@@ -16,6 +16,7 @@
 #define GVISOR_TEST_SYSCALLS_UNIX_DOMAIN_SOCKET_TEST_UTIL_H_
 
 #include <string>
+
 #include "test/syscalls/linux/socket_test_util.h"
 
 namespace gvisor {
diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc
index 80716859a..12b925a51 100644
--- a/test/syscalls/linux/utimes.cc
+++ b/test/syscalls/linux/utimes.cc
@@ -20,6 +20,7 @@
 #include <time.h>
 #include <unistd.h>
 #include <utime.h>
+
 #include <string>
 
 #include "absl/time/time.h"
diff --git a/test/syscalls/linux/vdso_clock_gettime.cc b/test/syscalls/linux/vdso_clock_gettime.cc
index 40c0014b9..ce1899f45 100644
--- a/test/syscalls/linux/vdso_clock_gettime.cc
+++ b/test/syscalls/linux/vdso_clock_gettime.cc
@@ -17,6 +17,7 @@
 #include <syscall.h>
 #include <time.h>
 #include <unistd.h>
+
 #include <map>
 #include <string>
 #include <utility>
diff --git a/test/util/fs_util_test.cc b/test/util/fs_util_test.cc
index 2a200320a..657b6a46e 100644
--- a/test/util/fs_util_test.cc
+++ b/test/util/fs_util_test.cc
@@ -12,12 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "test/util/fs_util.h"
+
 #include <errno.h>
+
 #include <vector>
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "test/util/fs_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
diff --git a/test/util/mount_util.h b/test/util/mount_util.h
index 38ec6c8a1..484de560e 100644
--- a/test/util/mount_util.h
+++ b/test/util/mount_util.h
@@ -17,6 +17,7 @@
 
 #include <errno.h>
 #include <sys/mount.h>
+
 #include <functional>
 #include <string>
 
diff --git a/test/util/posix_error_test.cc b/test/util/posix_error_test.cc
index d67270842..bf9465abb 100644
--- a/test/util/posix_error_test.cc
+++ b/test/util/posix_error_test.cc
@@ -15,6 +15,7 @@
 #include "test/util/posix_error.h"
 
 #include <errno.h>
+
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
diff --git a/test/util/rlimit_util.cc b/test/util/rlimit_util.cc
index 684253f78..d7bfc1606 100644
--- a/test/util/rlimit_util.cc
+++ b/test/util/rlimit_util.cc
@@ -15,6 +15,7 @@
 #include "test/util/rlimit_util.h"
 
 #include <sys/resource.h>
+
 #include <cerrno>
 
 #include "test/util/cleanup.h"
diff --git a/test/util/signal_util.cc b/test/util/signal_util.cc
index 26738864f..5ee95ee80 100644
--- a/test/util/signal_util.cc
+++ b/test/util/signal_util.cc
@@ -15,6 +15,7 @@
 #include "test/util/signal_util.h"
 
 #include <signal.h>
+
 #include <ostream>
 
 #include "gtest/gtest.h"
diff --git a/test/util/signal_util.h b/test/util/signal_util.h
index 7fd2af015..bcf85c337 100644
--- a/test/util/signal_util.h
+++ b/test/util/signal_util.h
@@ -18,6 +18,7 @@
 #include <signal.h>
 #include <sys/syscall.h>
 #include <unistd.h>
+
 #include <ostream>
 
 #include "gmock/gmock.h"
diff --git a/test/util/temp_path.h b/test/util/temp_path.h
index 92d669503..9e5ac11f4 100644
--- a/test/util/temp_path.h
+++ b/test/util/temp_path.h
@@ -16,6 +16,7 @@
 #define GVISOR_TEST_UTIL_TEMP_PATH_H_
 
 #include <sys/stat.h>
+
 #include <string>
 #include <utility>
 
diff --git a/test/util/test_util_test.cc b/test/util/test_util_test.cc
index b7300d9e5..f42100374 100644
--- a/test/util/test_util_test.cc
+++ b/test/util/test_util_test.cc
@@ -15,6 +15,7 @@
 #include "test/util/test_util.h"
 
 #include <errno.h>
+
 #include <vector>
 
 #include "gmock/gmock.h"
diff --git a/third_party/gvsync/BUILD b/third_party/gvsync/BUILD
deleted file mode 100644
index 7d6d59c48..000000000
--- a/third_party/gvsync/BUILD
+++ /dev/null
@@ -1,53 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template")
-
-package(
-    default_visibility = ["//:sandbox"],
-    licenses = ["notice"],
-)
-
-exports_files(["LICENSE"])
-
-go_template(
-    name = "generic_atomicptr",
-    srcs = ["atomicptr_unsafe.go"],
-    types = [
-        "Value",
-    ],
-)
-
-go_template(
-    name = "generic_seqatomic",
-    srcs = ["seqatomic_unsafe.go"],
-    types = [
-        "Value",
-    ],
-    deps = [
-        ":sync",
-    ],
-)
-
-go_library(
-    name = "gvsync",
-    srcs = [
-        "downgradable_rwmutex_1_12_unsafe.go",
-        "downgradable_rwmutex_1_13_unsafe.go",
-        "downgradable_rwmutex_unsafe.go",
-        "gvsync.go",
-        "memmove_unsafe.go",
-        "norace_unsafe.go",
-        "race_unsafe.go",
-        "seqcount.go",
-    ],
-    importpath = "gvisor.dev/gvisor/third_party/gvsync",
-)
-
-go_test(
-    name = "gvsync_test",
-    size = "small",
-    srcs = [
-        "downgradable_rwmutex_test.go",
-        "seqcount_test.go",
-    ],
-    embed = [":gvsync"],
-)
diff --git a/third_party/gvsync/LICENSE b/third_party/gvsync/LICENSE
deleted file mode 100644
index 6a66aea5e..000000000
--- a/third_party/gvsync/LICENSE
+++ /dev/null
@@ -1,27 +0,0 @@
-Copyright (c) 2009 The Go Authors. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-   * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-   * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-   * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/gvsync/README.md b/third_party/gvsync/README.md
deleted file mode 100644
index fcc7e6f44..000000000
--- a/third_party/gvsync/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-This package provides additional synchronization primitives not provided by the
-Go stdlib 'sync' package. It is partially derived from the upstream 'sync'
-package.
diff --git a/third_party/gvsync/atomicptr_unsafe.go b/third_party/gvsync/atomicptr_unsafe.go
deleted file mode 100644
index 525c4beed..000000000
--- a/third_party/gvsync/atomicptr_unsafe.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package template doesn't exist. This file must be instantiated using the
-// go_template_instance rule in tools/go_generics/defs.bzl.
-package template
-
-import (
-	"sync/atomic"
-	"unsafe"
-)
-
-// Value is a required type parameter.
-type Value struct{}
-
-// An AtomicPtr is a pointer to a value of type Value that can be atomically
-// loaded and stored. The zero value of an AtomicPtr represents nil.
-//
-// Note that copying AtomicPtr by value performs a non-atomic read of the
-// stored pointer, which is unsafe if Store() can be called concurrently; in
-// this case, do `dst.Store(src.Load())` instead.
-//
-// +stateify savable
-type AtomicPtr struct {
-	ptr unsafe.Pointer `state:".(*Value)"`
-}
-
-func (p *AtomicPtr) savePtr() *Value {
-	return p.Load()
-}
-
-func (p *AtomicPtr) loadPtr(v *Value) {
-	p.Store(v)
-}
-
-// Load returns the value set by the most recent Store. It returns nil if there
-// has been no previous call to Store.
-func (p *AtomicPtr) Load() *Value {
-	return (*Value)(atomic.LoadPointer(&p.ptr))
-}
-
-// Store sets the value returned by Load to x.
-func (p *AtomicPtr) Store(x *Value) {
-	atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x))
-}
diff --git a/third_party/gvsync/atomicptrtest/BUILD b/third_party/gvsync/atomicptrtest/BUILD
deleted file mode 100644
index 447ecf96a..000000000
--- a/third_party/gvsync/atomicptrtest/BUILD
+++ /dev/null
@@ -1,28 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
-package(licenses = ["notice"])
-
-go_template_instance(
-    name = "atomicptr_int",
-    out = "atomicptr_int_unsafe.go",
-    package = "atomicptr",
-    suffix = "Int",
-    template = "//third_party/gvsync:generic_atomicptr",
-    types = {
-        "Value": "int",
-    },
-)
-
-go_library(
-    name = "atomicptr",
-    srcs = ["atomicptr_int_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/third_party/gvsync/atomicptr",
-)
-
-go_test(
-    name = "atomicptr_test",
-    size = "small",
-    srcs = ["atomicptr_test.go"],
-    embed = [":atomicptr"],
-)
diff --git a/third_party/gvsync/atomicptrtest/atomicptr_test.go b/third_party/gvsync/atomicptrtest/atomicptr_test.go
deleted file mode 100644
index 8fdc5112e..000000000
--- a/third_party/gvsync/atomicptrtest/atomicptr_test.go
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package atomicptr
-
-import (
-	"testing"
-)
-
-func newInt(val int) *int {
-	return &val
-}
-
-func TestAtomicPtr(t *testing.T) {
-	var p AtomicPtrInt
-	if got := p.Load(); got != nil {
-		t.Errorf("initial value is %p (%v), wanted nil", got, got)
-	}
-	want := newInt(42)
-	p.Store(want)
-	if got := p.Load(); got != want {
-		t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want)
-	}
-	want = newInt(100)
-	p.Store(want)
-	if got := p.Load(); got != want {
-		t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want)
-	}
-}
diff --git a/third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go b/third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go
deleted file mode 100644
index 855b2a2b1..000000000
--- a/third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.12
-// +build !go1.13
-
-// TODO(b/133868570): Delete once Go 1.12 is no longer supported.
-
-package gvsync
-
-import _ "unsafe"
-
-//go:linkname runtimeSemrelease112 sync.runtime_Semrelease
-func runtimeSemrelease112(s *uint32, handoff bool)
-
-func runtimeSemrelease(s *uint32, handoff bool, skipframes int) {
-	// 'skipframes' is only available starting from 1.13.
-	runtimeSemrelease112(s, handoff)
-}
diff --git a/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go b/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go
deleted file mode 100644
index 3b9346843..000000000
--- a/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.13
-// +build !go1.15
-
-// Check go:linkname function signatures when updating Go version.
-
-package gvsync
-
-import _ "unsafe"
-
-//go:linkname runtimeSemrelease sync.runtime_Semrelease
-func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
diff --git a/third_party/gvsync/downgradable_rwmutex_test.go b/third_party/gvsync/downgradable_rwmutex_test.go
deleted file mode 100644
index 40c384b8b..000000000
--- a/third_party/gvsync/downgradable_rwmutex_test.go
+++ /dev/null
@@ -1,150 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// GOMAXPROCS=10 go test
-
-// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the
-// addition of downgradingWriter and the renaming of num_iterations to
-// numIterations to shut up Golint.
-
-package gvsync
-
-import (
-	"fmt"
-	"runtime"
-	"sync/atomic"
-	"testing"
-)
-
-func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) {
-	m.RLock()
-	clocked <- true
-	<-cunlock
-	m.RUnlock()
-	cdone <- true
-}
-
-func doTestParallelReaders(numReaders, gomaxprocs int) {
-	runtime.GOMAXPROCS(gomaxprocs)
-	var m DowngradableRWMutex
-	clocked := make(chan bool)
-	cunlock := make(chan bool)
-	cdone := make(chan bool)
-	for i := 0; i < numReaders; i++ {
-		go parallelReader(&m, clocked, cunlock, cdone)
-	}
-	// Wait for all parallel RLock()s to succeed.
-	for i := 0; i < numReaders; i++ {
-		<-clocked
-	}
-	for i := 0; i < numReaders; i++ {
-		cunlock <- true
-	}
-	// Wait for the goroutines to finish.
-	for i := 0; i < numReaders; i++ {
-		<-cdone
-	}
-}
-
-func TestParallelReaders(t *testing.T) {
-	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
-	doTestParallelReaders(1, 4)
-	doTestParallelReaders(3, 4)
-	doTestParallelReaders(4, 2)
-}
-
-func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
-	for i := 0; i < numIterations; i++ {
-		rwm.RLock()
-		n := atomic.AddInt32(activity, 1)
-		if n < 1 || n >= 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		atomic.AddInt32(activity, -1)
-		rwm.RUnlock()
-	}
-	cdone <- true
-}
-
-func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
-	for i := 0; i < numIterations; i++ {
-		rwm.Lock()
-		n := atomic.AddInt32(activity, 10000)
-		if n != 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		atomic.AddInt32(activity, -10000)
-		rwm.Unlock()
-	}
-	cdone <- true
-}
-
-func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) {
-	for i := 0; i < numIterations; i++ {
-		rwm.Lock()
-		n := atomic.AddInt32(activity, 10000)
-		if n != 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		atomic.AddInt32(activity, -10000)
-		rwm.DowngradeLock()
-		n = atomic.AddInt32(activity, 1)
-		if n < 1 || n >= 10000 {
-			panic(fmt.Sprintf("wlock(%d)\n", n))
-		}
-		for i := 0; i < 100; i++ {
-		}
-		n = atomic.AddInt32(activity, -1)
-		rwm.RUnlock()
-	}
-	cdone <- true
-}
-
-func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) {
-	runtime.GOMAXPROCS(gomaxprocs)
-	// Number of active readers + 10000 * number of active writers.
-	var activity int32
-	var rwm DowngradableRWMutex
-	cdone := make(chan bool)
-	go writer(&rwm, numIterations, &activity, cdone)
-	go downgradingWriter(&rwm, numIterations, &activity, cdone)
-	var i int
-	for i = 0; i < numReaders/2; i++ {
-		go reader(&rwm, numIterations, &activity, cdone)
-	}
-	go writer(&rwm, numIterations, &activity, cdone)
-	go downgradingWriter(&rwm, numIterations, &activity, cdone)
-	for ; i < numReaders; i++ {
-		go reader(&rwm, numIterations, &activity, cdone)
-	}
-	// Wait for the 4 writers and all readers to finish.
-	for i := 0; i < 4+numReaders; i++ {
-		<-cdone
-	}
-}
-
-func TestDowngradableRWMutex(t *testing.T) {
-	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
-	n := 1000
-	if testing.Short() {
-		n = 5
-	}
-	HammerDowngradableRWMutex(1, 1, n)
-	HammerDowngradableRWMutex(1, 3, n)
-	HammerDowngradableRWMutex(1, 10, n)
-	HammerDowngradableRWMutex(4, 1, n)
-	HammerDowngradableRWMutex(4, 3, n)
-	HammerDowngradableRWMutex(4, 10, n)
-	HammerDowngradableRWMutex(10, 1, n)
-	HammerDowngradableRWMutex(10, 3, n)
-	HammerDowngradableRWMutex(10, 10, n)
-	HammerDowngradableRWMutex(10, 5, n)
-}
diff --git a/third_party/gvsync/downgradable_rwmutex_unsafe.go b/third_party/gvsync/downgradable_rwmutex_unsafe.go
deleted file mode 100644
index b7862d185..000000000
--- a/third_party/gvsync/downgradable_rwmutex_unsafe.go
+++ /dev/null
@@ -1,143 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.12
-// +build !go1.15
-
-// Check go:linkname function signatures when updating Go version.
-
-// This is mostly copied from the standard library's sync/rwmutex.go.
-//
-// Happens-before relationships indicated to the race detector:
-// - Unlock -> Lock (via writerSem)
-// - Unlock -> RLock (via readerSem)
-// - RUnlock -> Lock (via writerSem)
-// - DowngradeLock -> RLock (via readerSem)
-
-package gvsync
-
-import (
-	"sync"
-	"sync/atomic"
-	"unsafe"
-)
-
-//go:linkname runtimeSemacquire sync.runtime_Semacquire
-func runtimeSemacquire(s *uint32)
-
-// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock
-// method.
-type DowngradableRWMutex struct {
-	w           sync.Mutex // held if there are pending writers
-	writerSem   uint32     // semaphore for writers to wait for completing readers
-	readerSem   uint32     // semaphore for readers to wait for completing writers
-	readerCount int32      // number of pending readers
-	readerWait  int32      // number of departing readers
-}
-
-const rwmutexMaxReaders = 1 << 30
-
-// RLock locks rw for reading.
-func (rw *DowngradableRWMutex) RLock() {
-	if RaceEnabled {
-		RaceDisable()
-	}
-	if atomic.AddInt32(&rw.readerCount, 1) < 0 {
-		// A writer is pending, wait for it.
-		runtimeSemacquire(&rw.readerSem)
-	}
-	if RaceEnabled {
-		RaceEnable()
-		RaceAcquire(unsafe.Pointer(&rw.readerSem))
-	}
-}
-
-// RUnlock undoes a single RLock call.
-func (rw *DowngradableRWMutex) RUnlock() {
-	if RaceEnabled {
-		RaceReleaseMerge(unsafe.Pointer(&rw.writerSem))
-		RaceDisable()
-	}
-	if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 {
-		if r+1 == 0 || r+1 == -rwmutexMaxReaders {
-			panic("RUnlock of unlocked DowngradableRWMutex")
-		}
-		// A writer is pending.
-		if atomic.AddInt32(&rw.readerWait, -1) == 0 {
-			// The last reader unblocks the writer.
-			runtimeSemrelease(&rw.writerSem, false, 0)
-		}
-	}
-	if RaceEnabled {
-		RaceEnable()
-	}
-}
-
-// Lock locks rw for writing.
-func (rw *DowngradableRWMutex) Lock() {
-	if RaceEnabled {
-		RaceDisable()
-	}
-	// First, resolve competition with other writers.
-	rw.w.Lock()
-	// Announce to readers there is a pending writer.
-	r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders
-	// Wait for active readers.
-	if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 {
-		runtimeSemacquire(&rw.writerSem)
-	}
-	if RaceEnabled {
-		RaceEnable()
-		RaceAcquire(unsafe.Pointer(&rw.writerSem))
-	}
-}
-
-// Unlock unlocks rw for writing.
-func (rw *DowngradableRWMutex) Unlock() {
-	if RaceEnabled {
-		RaceRelease(unsafe.Pointer(&rw.writerSem))
-		RaceRelease(unsafe.Pointer(&rw.readerSem))
-		RaceDisable()
-	}
-	// Announce to readers there is no active writer.
-	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders)
-	if r >= rwmutexMaxReaders {
-		panic("Unlock of unlocked DowngradableRWMutex")
-	}
-	// Unblock blocked readers, if any.
-	for i := 0; i < int(r); i++ {
-		runtimeSemrelease(&rw.readerSem, false, 0)
-	}
-	// Allow other writers to proceed.
-	rw.w.Unlock()
-	if RaceEnabled {
-		RaceEnable()
-	}
-}
-
-// DowngradeLock atomically unlocks rw for writing and locks it for reading.
-func (rw *DowngradableRWMutex) DowngradeLock() {
-	if RaceEnabled {
-		RaceRelease(unsafe.Pointer(&rw.readerSem))
-		RaceDisable()
-	}
-	// Announce to readers there is no active writer and one additional reader.
-	r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1)
-	if r >= rwmutexMaxReaders+1 {
-		panic("DowngradeLock of unlocked DowngradableRWMutex")
-	}
-	// Unblock blocked readers, if any. Note that this loop starts as 1 since r
-	// includes this goroutine.
-	for i := 1; i < int(r); i++ {
-		runtimeSemrelease(&rw.readerSem, false, 0)
-	}
-	// Allow other writers to proceed to rw.w.Lock(). Note that they will still
-	// block on rw.writerSem since at least this reader exists, such that
-	// DowngradeLock() is atomic with the previous write lock.
-	rw.w.Unlock()
-	if RaceEnabled {
-		RaceEnable()
-	}
-}
diff --git a/third_party/gvsync/gvsync.go b/third_party/gvsync/gvsync.go
deleted file mode 100644
index 3bbef13c3..000000000
--- a/third_party/gvsync/gvsync.go
+++ /dev/null
@@ -1,7 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package gvsync provides synchronization primitives.
-package gvsync
diff --git a/third_party/gvsync/memmove_unsafe.go b/third_party/gvsync/memmove_unsafe.go
deleted file mode 100644
index 9dd1d6142..000000000
--- a/third_party/gvsync/memmove_unsafe.go
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.12
-// +build !go1.15
-
-// Check go:linkname function signatures when updating Go version.
-
-package gvsync
-
-import (
-	"unsafe"
-)
-
-//go:linkname memmove runtime.memmove
-//go:noescape
-func memmove(to, from unsafe.Pointer, n uintptr)
-
-// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad<T>, which can't
-// define it because go_generics can't update the go:linkname annotation.
-// Furthermore, go:linkname silently doesn't work if the local name is exported
-// (this is of course undocumented), which is why this indirection is
-// necessary.
-func Memmove(to, from unsafe.Pointer, n uintptr) {
-	memmove(to, from, n)
-}
diff --git a/third_party/gvsync/norace_unsafe.go b/third_party/gvsync/norace_unsafe.go
deleted file mode 100644
index e3852db8c..000000000
--- a/third_party/gvsync/norace_unsafe.go
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !race
-
-package gvsync
-
-import (
-	"unsafe"
-)
-
-// RaceEnabled is true if the Go data race detector is enabled.
-const RaceEnabled = false
-
-// RaceDisable has the same semantics as runtime.RaceDisable.
-func RaceDisable() {
-}
-
-// RaceEnable has the same semantics as runtime.RaceEnable.
-func RaceEnable() {
-}
-
-// RaceAcquire has the same semantics as runtime.RaceAcquire.
-func RaceAcquire(addr unsafe.Pointer) {
-}
-
-// RaceRelease has the same semantics as runtime.RaceRelease.
-func RaceRelease(addr unsafe.Pointer) {
-}
-
-// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge.
-func RaceReleaseMerge(addr unsafe.Pointer) {
-}
diff --git a/third_party/gvsync/race_unsafe.go b/third_party/gvsync/race_unsafe.go
deleted file mode 100644
index 13c02a830..000000000
--- a/third_party/gvsync/race_unsafe.go
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build race
-
-package gvsync
-
-import (
-	"runtime"
-	"unsafe"
-)
-
-// RaceEnabled is true if the Go data race detector is enabled.
-const RaceEnabled = true
-
-// RaceDisable has the same semantics as runtime.RaceDisable.
-func RaceDisable() {
-	runtime.RaceDisable()
-}
-
-// RaceEnable has the same semantics as runtime.RaceEnable.
-func RaceEnable() {
-	runtime.RaceEnable()
-}
-
-// RaceAcquire has the same semantics as runtime.RaceAcquire.
-func RaceAcquire(addr unsafe.Pointer) {
-	runtime.RaceAcquire(addr)
-}
-
-// RaceRelease has the same semantics as runtime.RaceRelease.
-func RaceRelease(addr unsafe.Pointer) {
-	runtime.RaceRelease(addr)
-}
-
-// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge.
-func RaceReleaseMerge(addr unsafe.Pointer) {
-	runtime.RaceReleaseMerge(addr)
-}
diff --git a/third_party/gvsync/seqatomic_unsafe.go b/third_party/gvsync/seqatomic_unsafe.go
deleted file mode 100644
index 382eeed43..000000000
--- a/third_party/gvsync/seqatomic_unsafe.go
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package template doesn't exist. This file must be instantiated using the
-// go_template_instance rule in tools/go_generics/defs.bzl.
-package template
-
-import (
-	"fmt"
-	"reflect"
-	"strings"
-	"unsafe"
-
-	"gvisor.dev/gvisor/third_party/gvsync"
-)
-
-// Value is a required type parameter.
-//
-// Value must not contain any pointers, including interface objects, function
-// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs
-// containing any of the above. An init() function will panic if this property
-// does not hold.
-type Value struct{}
-
-// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
-// with any writer critical sections in sc.
-func SeqAtomicLoad(sc *gvsync.SeqCount, ptr *Value) Value {
-	// This function doesn't use SeqAtomicTryLoad because doing so is
-	// measurably, significantly (~20%) slower; Go is awful at inlining.
-	var val Value
-	for {
-		epoch := sc.BeginRead()
-		if gvsync.RaceEnabled {
-			// runtime.RaceDisable() doesn't actually stop the race detector,
-			// so it can't help us here. Instead, call runtime.memmove
-			// directly, which is not instrumented by the race detector.
-			gvsync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
-		} else {
-			// This is ~40% faster for short reads than going through memmove.
-			val = *ptr
-		}
-		if sc.ReadOk(epoch) {
-			break
-		}
-	}
-	return val
-}
-
-// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section
-// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read
-// would race with a writer critical section, SeqAtomicTryLoad returns
-// (unspecified, false).
-func SeqAtomicTryLoad(sc *gvsync.SeqCount, epoch gvsync.SeqCountEpoch, ptr *Value) (Value, bool) {
-	var val Value
-	if gvsync.RaceEnabled {
-		gvsync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
-	} else {
-		val = *ptr
-	}
-	return val, sc.ReadOk(epoch)
-}
-
-func init() {
-	var val Value
-	typ := reflect.TypeOf(val)
-	name := typ.Name()
-	if ptrs := gvsync.PointersInType(typ, name); len(ptrs) != 0 {
-		panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n")))
-	}
-}
diff --git a/third_party/gvsync/seqatomictest/BUILD b/third_party/gvsync/seqatomictest/BUILD
deleted file mode 100644
index c858c20c4..000000000
--- a/third_party/gvsync/seqatomictest/BUILD
+++ /dev/null
@@ -1,34 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
-package(licenses = ["notice"])
-
-go_template_instance(
-    name = "seqatomic_int",
-    out = "seqatomic_int_unsafe.go",
-    package = "seqatomic",
-    suffix = "Int",
-    template = "//third_party/gvsync:generic_seqatomic",
-    types = {
-        "Value": "int",
-    },
-)
-
-go_library(
-    name = "seqatomic",
-    srcs = ["seqatomic_int_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/third_party/gvsync/seqatomic",
-    deps = [
-        "//third_party/gvsync",
-    ],
-)
-
-go_test(
-    name = "seqatomic_test",
-    size = "small",
-    srcs = ["seqatomic_test.go"],
-    embed = [":seqatomic"],
-    deps = [
-        "//third_party/gvsync",
-    ],
-)
diff --git a/third_party/gvsync/seqatomictest/seqatomic_test.go b/third_party/gvsync/seqatomictest/seqatomic_test.go
deleted file mode 100644
index a5447f589..000000000
--- a/third_party/gvsync/seqatomictest/seqatomic_test.go
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package seqatomic
-
-import (
-	"sync/atomic"
-	"testing"
-	"time"
-
-	"gvisor.dev/gvisor/third_party/gvsync"
-)
-
-func TestSeqAtomicLoadUncontended(t *testing.T) {
-	var seq gvsync.SeqCount
-	const want = 1
-	data := want
-	if got := SeqAtomicLoadInt(&seq, &data); got != want {
-		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
-	}
-}
-
-func TestSeqAtomicLoadAfterWrite(t *testing.T) {
-	var seq gvsync.SeqCount
-	var data int
-	const want = 1
-	seq.BeginWrite()
-	data = want
-	seq.EndWrite()
-	if got := SeqAtomicLoadInt(&seq, &data); got != want {
-		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
-	}
-}
-
-func TestSeqAtomicLoadDuringWrite(t *testing.T) {
-	var seq gvsync.SeqCount
-	var data int
-	const want = 1
-	seq.BeginWrite()
-	go func() {
-		time.Sleep(time.Second)
-		data = want
-		seq.EndWrite()
-	}()
-	if got := SeqAtomicLoadInt(&seq, &data); got != want {
-		t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
-	}
-}
-
-func TestSeqAtomicTryLoadUncontended(t *testing.T) {
-	var seq gvsync.SeqCount
-	const want = 1
-	data := want
-	epoch := seq.BeginRead()
-	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want {
-		t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want)
-	}
-}
-
-func TestSeqAtomicTryLoadDuringWrite(t *testing.T) {
-	var seq gvsync.SeqCount
-	var data int
-	epoch := seq.BeginRead()
-	seq.BeginWrite()
-	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok {
-		t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got)
-	}
-	seq.EndWrite()
-}
-
-func TestSeqAtomicTryLoadAfterWrite(t *testing.T) {
-	var seq gvsync.SeqCount
-	var data int
-	epoch := seq.BeginRead()
-	seq.BeginWrite()
-	seq.EndWrite()
-	if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok {
-		t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got)
-	}
-}
-
-func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) {
-	var seq gvsync.SeqCount
-	const want = 42
-	data := want
-	b.RunParallel(func(pb *testing.PB) {
-		for pb.Next() {
-			if got := SeqAtomicLoadInt(&seq, &data); got != want {
-				b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want)
-			}
-		}
-	})
-}
-
-func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) {
-	var seq gvsync.SeqCount
-	const want = 42
-	data := want
-	b.RunParallel(func(pb *testing.PB) {
-		epoch := seq.BeginRead()
-		for pb.Next() {
-			if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want {
-				b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want)
-			}
-		}
-	})
-}
-
-// For comparison:
-func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) {
-	var a atomic.Value
-	const want = 42
-	a.Store(int(want))
-	b.RunParallel(func(pb *testing.PB) {
-		for pb.Next() {
-			if got := a.Load().(int); got != want {
-				b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want)
-			}
-		}
-	})
-}
diff --git a/third_party/gvsync/seqcount.go b/third_party/gvsync/seqcount.go
deleted file mode 100644
index 2c9c2c3d6..000000000
--- a/third_party/gvsync/seqcount.go
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package gvsync
-
-import (
-	"fmt"
-	"reflect"
-	"runtime"
-	"sync/atomic"
-)
-
-// SeqCount is a synchronization primitive for optimistic reader/writer
-// synchronization in cases where readers can work with stale data and
-// therefore do not need to block writers.
-//
-// Compared to sync/atomic.Value:
-//
-// - Mutation of SeqCount-protected data does not require memory allocation,
-// whereas atomic.Value generally does. This is a significant advantage when
-// writes are common.
-//
-// - Atomic reads of SeqCount-protected data require copying. This is a
-// disadvantage when atomic reads are common.
-//
-// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other
-// operations to be made atomic with reads of SeqCount-protected data.
-//
-// - SeqCount may be less flexible: as of this writing, SeqCount-protected data
-// cannot include pointers.
-//
-// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected
-// data require instantiating function templates using go_generics (see
-// seqatomic.go).
-type SeqCount struct {
-	// epoch is incremented by BeginWrite and EndWrite, such that epoch is odd
-	// if a writer critical section is active, and a read from data protected
-	// by this SeqCount is atomic iff epoch is the same even value before and
-	// after the read.
-	epoch uint32
-}
-
-// SeqCountEpoch tracks writer critical sections in a SeqCount.
-type SeqCountEpoch struct {
-	val uint32
-}
-
-// We assume that:
-//
-// - All functions in sync/atomic that perform a memory read are at least a
-// read fence: memory reads before calls to such functions cannot be reordered
-// after the call, and memory reads after calls to such functions cannot be
-// reordered before the call, even if those reads do not use sync/atomic.
-//
-// - All functions in sync/atomic that perform a memory write are at least a
-// write fence: memory writes before calls to such functions cannot be
-// reordered after the call, and memory writes after calls to such functions
-// cannot be reordered before the call, even if those writes do not use
-// sync/atomic.
-//
-// As of this writing, the Go memory model completely fails to describe
-// sync/atomic, but these properties are implied by
-// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8.
-
-// BeginRead indicates the beginning of a reader critical section. Reader
-// critical sections DO NOT BLOCK writer critical sections, so operations in a
-// reader critical section MAY RACE with writer critical sections. Races are
-// detected by ReadOk at the end of the reader critical section. Thus, the
-// low-level structure of readers is generally:
-//
-//     for {
-//         epoch := seq.BeginRead()
-//         // do something idempotent with seq-protected data
-//         if seq.ReadOk(epoch) {
-//             break
-//         }
-//     }
-//
-// However, since reader critical sections may race with writer critical
-// sections, the Go race detector will (accurately) flag data races in readers
-// using this pattern. Most users of SeqCount will need to use the
-// SeqAtomicLoad function template in seqatomic.go.
-func (s *SeqCount) BeginRead() SeqCountEpoch {
-	epoch := atomic.LoadUint32(&s.epoch)
-	for epoch&1 != 0 {
-		runtime.Gosched()
-		epoch = atomic.LoadUint32(&s.epoch)
-	}
-	return SeqCountEpoch{epoch}
-}
-
-// ReadOk returns true if the reader critical section initiated by a previous
-// call to BeginRead() that returned epoch did not race with any writer critical
-// sections.
-//
-// ReadOk may be called any number of times during a reader critical section.
-// Reader critical sections do not need to be explicitly terminated; the last
-// call to ReadOk is implicitly the end of the reader critical section.
-func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool {
-	return atomic.LoadUint32(&s.epoch) == epoch.val
-}
-
-// BeginWrite indicates the beginning of a writer critical section.
-//
-// SeqCount does not support concurrent writer critical sections; clients with
-// concurrent writers must synchronize them using e.g. sync.Mutex.
-func (s *SeqCount) BeginWrite() {
-	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 {
-		panic("SeqCount.BeginWrite during writer critical section")
-	}
-}
-
-// EndWrite ends the effect of a preceding BeginWrite.
-func (s *SeqCount) EndWrite() {
-	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 {
-		panic("SeqCount.EndWrite outside writer critical section")
-	}
-}
-
-// PointersInType returns a list of pointers reachable from values named
-// valName of the given type.
-//
-// PointersInType is not exhaustive, but it is guaranteed that if typ contains
-// at least one pointer, then PointersInTypeOf returns a non-empty list.
-func PointersInType(typ reflect.Type, valName string) []string {
-	switch kind := typ.Kind(); kind {
-	case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
-		return nil
-
-	case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer:
-		return []string{valName}
-
-	case reflect.Array:
-		return PointersInType(typ.Elem(), valName+"[]")
-
-	case reflect.Struct:
-		var ptrs []string
-		for i, n := 0, typ.NumField(); i < n; i++ {
-			field := typ.Field(i)
-			ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...)
-		}
-		return ptrs
-
-	default:
-		return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)}
-	}
-}
diff --git a/third_party/gvsync/seqcount_test.go b/third_party/gvsync/seqcount_test.go
deleted file mode 100644
index 085e574b3..000000000
--- a/third_party/gvsync/seqcount_test.go
+++ /dev/null
@@ -1,153 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package gvsync
-
-import (
-	"reflect"
-	"testing"
-	"time"
-)
-
-func TestSeqCountWriteUncontended(t *testing.T) {
-	var seq SeqCount
-	seq.BeginWrite()
-	seq.EndWrite()
-}
-
-func TestSeqCountReadUncontended(t *testing.T) {
-	var seq SeqCount
-	epoch := seq.BeginRead()
-	if !seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got false, wanted true")
-	}
-}
-
-func TestSeqCountBeginReadAfterWrite(t *testing.T) {
-	var seq SeqCount
-	var data int32
-	const want = 1
-	seq.BeginWrite()
-	data = want
-	seq.EndWrite()
-	epoch := seq.BeginRead()
-	if data != want {
-		t.Errorf("Reader: got %v, wanted %v", data, want)
-	}
-	if !seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got false, wanted true")
-	}
-}
-
-func TestSeqCountBeginReadDuringWrite(t *testing.T) {
-	var seq SeqCount
-	var data int
-	const want = 1
-	seq.BeginWrite()
-	go func() {
-		time.Sleep(time.Second)
-		data = want
-		seq.EndWrite()
-	}()
-	epoch := seq.BeginRead()
-	if data != want {
-		t.Errorf("Reader: got %v, wanted %v", data, want)
-	}
-	if !seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got false, wanted true")
-	}
-}
-
-func TestSeqCountReadOkAfterWrite(t *testing.T) {
-	var seq SeqCount
-	epoch := seq.BeginRead()
-	seq.BeginWrite()
-	seq.EndWrite()
-	if seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got true, wanted false")
-	}
-}
-
-func TestSeqCountReadOkDuringWrite(t *testing.T) {
-	var seq SeqCount
-	epoch := seq.BeginRead()
-	seq.BeginWrite()
-	if seq.ReadOk(epoch) {
-		t.Errorf("ReadOk: got true, wanted false")
-	}
-	seq.EndWrite()
-}
-
-func BenchmarkSeqCountWriteUncontended(b *testing.B) {
-	var seq SeqCount
-	for i := 0; i < b.N; i++ {
-		seq.BeginWrite()
-		seq.EndWrite()
-	}
-}
-
-func BenchmarkSeqCountReadUncontended(b *testing.B) {
-	var seq SeqCount
-	b.RunParallel(func(pb *testing.PB) {
-		for pb.Next() {
-			epoch := seq.BeginRead()
-			if !seq.ReadOk(epoch) {
-				b.Fatalf("ReadOk: got false, wanted true")
-			}
-		}
-	})
-}
-
-func TestPointersInType(t *testing.T) {
-	for _, test := range []struct {
-		name string // used for both test and value name
-		val  interface{}
-		ptrs []string
-	}{
-		{
-			name: "EmptyStruct",
-			val:  struct{}{},
-		},
-		{
-			name: "Int",
-			val:  int(0),
-		},
-		{
-			name: "MixedStruct",
-			val: struct {
-				b             bool
-				I             int
-				ExportedPtr   *struct{}
-				unexportedPtr *struct{}
-				arr           [2]int
-				ptrArr        [2]*int
-				nestedStruct  struct {
-					nestedNonptr int
-					nestedPtr    *int
-				}
-				structArr [1]struct {
-					nonptr int
-					ptr    *int
-				}
-			}{},
-			ptrs: []string{
-				"MixedStruct.ExportedPtr",
-				"MixedStruct.unexportedPtr",
-				"MixedStruct.ptrArr[]",
-				"MixedStruct.nestedStruct.nestedPtr",
-				"MixedStruct.structArr[].ptr",
-			},
-		},
-	} {
-		t.Run(test.name, func(t *testing.T) {
-			typ := reflect.TypeOf(test.val)
-			ptrs := PointersInType(typ, test.name)
-			t.Logf("Found pointers: %v", ptrs)
-			if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) {
-				t.Errorf("Got %v, wanted %v", ptrs, test.ptrs)
-			}
-		})
-	}
-}
diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD
index fa82f8e9b..d412e1ccf 100644
--- a/tools/go_marshal/test/BUILD
+++ b/tools/go_marshal/test/BUILD
@@ -1,9 +1,8 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_marshal:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
-load("//tools/go_marshal:defs.bzl", "go_library")
-
 package_group(
     name = "gomarshal_test",
     packages = [
diff --git a/tools/go_marshal/test/external/BUILD b/tools/go_marshal/test/external/BUILD
index 8fb43179b..9bb89e1da 100644
--- a/tools/go_marshal/test/external/BUILD
+++ b/tools/go_marshal/test/external/BUILD
@@ -1,7 +1,7 @@
-package(licenses = ["notice"])
-
 load("//tools/go_marshal:defs.bzl", "go_library")
 
+package(licenses = ["notice"])
+
 go_library(
     name = "external",
     testonly = 1,
-- 
cgit v1.2.3


From 4e27ba372e12e3186c0d03b32a7829b0d50f7a89 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 22 Nov 2019 10:42:57 -0800
Subject: tests: include sys/socket.h before linux/if_arp.h

This is how it has to be according to the man page.

PiperOrigin-RevId: 281998068
---
 test/syscalls/linux/socket_netlink_util.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index da99f0d60..76e772c48 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -15,6 +15,8 @@
 #ifndef GVISOR_TEST_SYSCALLS_SOCKET_NETLINK_UTIL_H_
 #define GVISOR_TEST_SYSCALLS_SOCKET_NETLINK_UTIL_H_
 
+#include <sys/socket.h>
+// socket.h has to be included before if_arp.h.
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
 
-- 
cgit v1.2.3


From 20279c305ece6a458006999c8dafc5672ca92803 Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Tue, 26 Nov 2019 18:19:47 -0800
Subject: Allow open(O_TRUNC) and (f)truncate for proc files.

This allows writable proc and device files to be opened with O_CREAT|O_TRUNC.
This is encountered most frequently when interacting with proc or device files
via the command line.
e.g. $ echo 8192 1048576 4194304 > /proc/sys/net/ipv4/tcp_rmem

Also adds a test to test the behavior of open(O_TRUNC), truncate, and ftruncate
on named pipes.

Fixes #1116

PiperOrigin-RevId: 282677425
---
 pkg/sentry/fs/proc/sys_net.go         | 17 ++++++++++++++---
 pkg/sentry/fs/tty/master.go           |  6 +++++-
 pkg/sentry/fs/tty/slave.go            |  6 +++++-
 pkg/sentry/syscalls/linux/sys_file.go | 12 ++++++++++--
 test/syscalls/linux/pipe.cc           | 14 ++++++++++++++
 5 files changed, 48 insertions(+), 7 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index f3b63dfc2..bd93f83fa 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -64,7 +64,7 @@ var _ fs.InodeOperations = (*tcpMemInode)(nil)
 
 func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir tcpMemDir) *fs.Inode {
 	tm := &tcpMemInode{
-		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC),
+		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
 		s:               s,
 		dir:             dir,
 	}
@@ -77,6 +77,11 @@ func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir
 	return fs.NewInode(ctx, tm, msrc, sattr)
 }
 
+// Truncate implements fs.InodeOperations.Truncate.
+func (tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
 // GetFile implements fs.InodeOperations.GetFile.
 func (m *tcpMemInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
 	flags.Pread = true
@@ -168,14 +173,15 @@ func writeSize(dirType tcpMemDir, s inet.Stack, size inet.TCPBufferSize) error {
 
 // +stateify savable
 type tcpSack struct {
+	fsutil.SimpleFileInode
+
 	stack   inet.Stack `state:"wait"`
 	enabled *bool
-	fsutil.SimpleFileInode
 }
 
 func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
 	ts := &tcpSack{
-		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC),
+		SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
 		stack:           s,
 	}
 	sattr := fs.StableAttr{
@@ -187,6 +193,11 @@ func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *f
 	return fs.NewInode(ctx, ts, msrc, sattr)
 }
 
+// Truncate implements fs.InodeOperations.Truncate.
+func (tcpSack) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
 // GetFile implements fs.InodeOperations.GetFile.
 func (s *tcpSack) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
 	flags.Pread = true
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index bc56be696..934828c12 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -32,7 +32,6 @@ import (
 // +stateify savable
 type masterInodeOperations struct {
 	fsutil.SimpleFileInode
-	fsutil.InodeNoopTruncate
 
 	// d is the containing dir.
 	d *dirInodeOperations
@@ -77,6 +76,11 @@ func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwn
 func (mi *masterInodeOperations) Release(ctx context.Context) {
 }
 
+// Truncate implements fs.InodeOperations.Truncate.
+func (masterInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
 // GetFile implements fs.InodeOperations.GetFile.
 //
 // It allocates a new terminal.
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index 4cbea0367..2a51e6bab 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -31,7 +31,6 @@ import (
 // +stateify savable
 type slaveInodeOperations struct {
 	fsutil.SimpleFileInode
-	fsutil.InodeNoopTruncate
 
 	// d is the containing dir.
 	d *dirInodeOperations
@@ -73,6 +72,11 @@ func (si *slaveInodeOperations) Release(ctx context.Context) {
 	si.t.DecRef()
 }
 
+// Truncate implements fs.InodeOperations.Truncate.
+func (slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
 // GetFile implements fs.InodeOperations.GetFile.
 //
 // This may race with destruction of the terminal. If the terminal is gone, it
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 167c2b60b..3b9181002 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -171,6 +171,9 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint
 			}
 		}
 
+		// Truncate is called when O_TRUNC is specified for any kind of
+		// existing Dirent. Behavior is delegated to the entry's Truncate
+		// implementation.
 		if flags&linux.O_TRUNC != 0 {
 			if err := d.Inode.Truncate(t, d, 0); err != nil {
 				return err
@@ -397,7 +400,9 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
 				return err
 			}
 
-			// Should we truncate the file?
+			// Truncate is called when O_TRUNC is specified for any kind of
+			// existing Dirent. Behavior is delegated to the entry's Truncate
+			// implementation.
 			if flags&linux.O_TRUNC != 0 {
 				if err := found.Inode.Truncate(t, found, 0); err != nil {
 					return err
@@ -1484,6 +1489,8 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		if fs.IsDir(d.Inode.StableAttr) {
 			return syserror.EISDIR
 		}
+		// In contrast to open(O_TRUNC), truncate(2) is only valid for file
+		// types.
 		if !fs.IsFile(d.Inode.StableAttr) {
 			return syserror.EINVAL
 		}
@@ -1522,7 +1529,8 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, syserror.EINVAL
 	}
 
-	// Note that this is different from truncate(2) above, where a
+	// In contrast to open(O_TRUNC), truncate(2) is only valid for file
+	// types. Note that this is different from truncate(2) above, where a
 	// directory returns EISDIR.
 	if !fs.IsFile(file.Dirent.Inode.StableAttr) {
 		return 0, nil, syserror.EINVAL
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index c0b354e65..ac9b21b24 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -212,6 +212,20 @@ TEST(Pipe2Test, BadOptions) {
   EXPECT_THAT(pipe2(fds, 0xDEAD), SyscallFailsWithErrno(EINVAL));
 }
 
+// Tests that opening named pipes with O_TRUNC shouldn't cause an error, but
+// calls to (f)truncate should.
+TEST(NamedPipeTest, Truncate) {
+  const std::string tmp_path = NewTempAbsPath();
+  SKIP_IF(mkfifo(tmp_path.c_str(), 0644) != 0);
+
+  ASSERT_THAT(open(tmp_path.c_str(), O_NONBLOCK | O_RDONLY), SyscallSucceeds());
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Open(tmp_path.c_str(), O_RDWR | O_NONBLOCK | O_TRUNC));
+
+  ASSERT_THAT(truncate(tmp_path.c_str(), 0), SyscallFailsWithErrno(EINVAL));
+  ASSERT_THAT(ftruncate(fd.get(), 0), SyscallFailsWithErrno(EINVAL));
+}
+
 TEST_P(PipeTest, Seek) {
   SKIP_IF(!CreateBlocking());
 
-- 
cgit v1.2.3


From 58afb4be695e6804925ba2be5f2d8c245f079cba Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Wed, 27 Nov 2019 13:47:44 -0800
Subject: Add floating point exception tests

PiperOrigin-RevId: 282828273
---
 test/syscalls/linux/exceptions.cc | 181 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc
index 370e85166..3d564e720 100644
--- a/test/syscalls/linux/exceptions.cc
+++ b/test/syscalls/linux/exceptions.cc
@@ -22,6 +22,23 @@
 namespace gvisor {
 namespace testing {
 
+// Default value for the x87 FPU control word. See Intel SDM Vol 1, Ch 8.1.5
+// "x87 FPU Control Word".
+constexpr uint16_t kX87ControlWordDefault = 0x37f;
+
+// Mask for the divide-by-zero exception.
+constexpr uint16_t kX87ControlWordDiv0Mask = 1 << 2;
+
+// Default value for the SSE control register (MXCSR). See Intel SDM Vol 1, Ch
+// 11.6.4 "Initialization of SSE/SSE3 Extensions".
+constexpr uint32_t kMXCSRDefault = 0x1f80;
+
+// Mask for the divide-by-zero exception.
+constexpr uint32_t kMXCSRDiv0Mask = 1 << 9;
+
+// Flag for a pending divide-by-zero exception.
+constexpr uint32_t kMXCSRDiv0Flag = 1 << 2;
+
 void inline Halt() { asm("hlt\r\n"); }
 
 void inline SetAlignmentCheck() {
@@ -107,6 +124,170 @@ TEST(ExceptionTest, DivideByZero) {
       ::testing::KilledBySignal(SIGFPE), "");
 }
 
+// By default, x87 exceptions are masked and simply return a default value.
+TEST(ExceptionTest, X87DivideByZeroMasked) {
+  int32_t quotient;
+  int32_t value = 1;
+  int32_t divisor = 0;
+  asm("fildl %[value]\r\n"
+      "fidivl %[divisor]\r\n"
+      "fistpl %[quotient]\r\n"
+      : [ quotient ] "=m"(quotient)
+      : [ value ] "m"(value), [ divisor ] "m"(divisor));
+
+  EXPECT_EQ(quotient, INT32_MIN);
+}
+
+// When unmasked, division by zero raises SIGFPE.
+TEST(ExceptionTest, X87DivideByZeroUnmasked) {
+  // See above.
+  struct sigaction sa = {};
+  sa.sa_handler = SIG_DFL;
+  auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa));
+
+  EXPECT_EXIT(
+      {
+        // Clear the divide by zero exception mask.
+        constexpr uint16_t kControlWord =
+            kX87ControlWordDefault & ~kX87ControlWordDiv0Mask;
+
+        int32_t quotient;
+        int32_t value = 1;
+        int32_t divisor = 0;
+        asm volatile(
+            "fldcw %[cw]\r\n"
+            "fildl %[value]\r\n"
+            "fidivl %[divisor]\r\n"
+            "fistpl %[quotient]\r\n"
+            : [ quotient ] "=m"(quotient)
+            : [ cw ] "m"(kControlWord), [ value ] "m"(value),
+              [ divisor ] "m"(divisor));
+      },
+      ::testing::KilledBySignal(SIGFPE), "");
+}
+
+// Pending exceptions in the x87 status register are not clobbered by syscalls.
+TEST(ExceptionTest, X87StatusClobber) {
+  // See above.
+  struct sigaction sa = {};
+  sa.sa_handler = SIG_DFL;
+  auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa));
+
+  EXPECT_EXIT(
+      {
+        // Clear the divide by zero exception mask.
+        constexpr uint16_t kControlWord =
+            kX87ControlWordDefault & ~kX87ControlWordDiv0Mask;
+
+        int32_t quotient;
+        int32_t value = 1;
+        int32_t divisor = 0;
+        asm volatile(
+            "fildl %[value]\r\n"
+            "fidivl %[divisor]\r\n"
+            // Exception is masked, so it does not occur here.
+            "fistpl %[quotient]\r\n"
+
+            // SYS_getpid placed in rax by constraint.
+            "syscall\r\n"
+
+            // Unmask exception. The syscall didn't clobber the pending
+            // exception, so now it can be raised.
+            //
+            // N.B. "a floating-point exception will be generated upon execution
+            // of the *next* floating-point instruction".
+            "fldcw %[cw]\r\n"
+            "fwait\r\n"
+            : [ quotient ] "=m"(quotient)
+            : [ value ] "m"(value), [ divisor ] "m"(divisor), "a"(SYS_getpid),
+              [ cw ] "m"(kControlWord)
+            : "rcx", "r11");
+      },
+      ::testing::KilledBySignal(SIGFPE), "");
+}
+
+// By default, SSE exceptions are masked and simply return a default value.
+TEST(ExceptionTest, SSEDivideByZeroMasked) {
+  uint32_t status;
+  int32_t quotient;
+  int32_t value = 1;
+  int32_t divisor = 0;
+  asm("cvtsi2ssl %[value], %%xmm0\r\n"
+      "cvtsi2ssl %[divisor], %%xmm1\r\n"
+      "divss %%xmm1, %%xmm0\r\n"
+      "cvtss2sil %%xmm0, %[quotient]\r\n"
+      : [ quotient ] "=r"(quotient), [ status ] "=r"(status)
+      : [ value ] "r"(value), [ divisor ] "r"(divisor)
+      : "xmm0", "xmm1");
+
+  EXPECT_EQ(quotient, INT32_MIN);
+}
+
+// When unmasked, division by zero raises SIGFPE.
+TEST(ExceptionTest, SSEDivideByZeroUnmasked) {
+  // See above.
+  struct sigaction sa = {};
+  sa.sa_handler = SIG_DFL;
+  auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa));
+
+  EXPECT_EXIT(
+      {
+        // Clear the divide by zero exception mask.
+        constexpr uint32_t kMXCSR = kMXCSRDefault & ~kMXCSRDiv0Mask;
+
+        int32_t quotient;
+        int32_t value = 1;
+        int32_t divisor = 0;
+        asm volatile(
+            "ldmxcsr %[mxcsr]\r\n"
+            "cvtsi2ssl %[value], %%xmm0\r\n"
+            "cvtsi2ssl %[divisor], %%xmm1\r\n"
+            "divss %%xmm1, %%xmm0\r\n"
+            "cvtss2sil %%xmm0, %[quotient]\r\n"
+            : [ quotient ] "=r"(quotient)
+            : [ mxcsr ] "m"(kMXCSR), [ value ] "r"(value),
+              [ divisor ] "r"(divisor)
+            : "xmm0", "xmm1");
+      },
+      ::testing::KilledBySignal(SIGFPE), "");
+}
+
+// Pending exceptions in the SSE status register are not clobbered by syscalls.
+TEST(ExceptionTest, SSEStatusClobber) {
+  uint32_t mxcsr;
+  int32_t quotient;
+  int32_t value = 1;
+  int32_t divisor = 0;
+  asm("cvtsi2ssl %[value], %%xmm0\r\n"
+      "cvtsi2ssl %[divisor], %%xmm1\r\n"
+      "divss %%xmm1, %%xmm0\r\n"
+      // Exception is masked, so it does not occur here.
+      "cvtss2sil %%xmm0, %[quotient]\r\n"
+
+      // SYS_getpid placed in rax by constraint.
+      "syscall\r\n"
+
+      // Intel SDM Vol 1, Ch 10.2.3.1 "SIMD Floating-Point Mask and Flag Bits":
+      // "If LDMXCSR or FXRSTOR clears a mask bit and sets the corresponding
+      // exception flag bit, a SIMD floating-point exception will not be
+      // generated as a result of this change. The unmasked exception will be
+      // generated only upon the execution of the next SSE/SSE2/SSE3 instruction
+      // that detects the unmasked exception condition."
+      //
+      // Though ambiguous, empirical evidence indicates that this means that
+      // exception flags set in the status register will never cause an
+      // exception to be raised; only a new exception condition will do so.
+      //
+      // Thus here we just check for the flag itself rather than trying to raise
+      // the exception.
+      "stmxcsr %[mxcsr]\r\n"
+      : [ quotient ] "=r"(quotient), [ mxcsr ] "+m"(mxcsr)
+      : [ value ] "r"(value), [ divisor ] "r"(divisor), "a"(SYS_getpid)
+      : "xmm0", "xmm1", "rcx", "r11");
+
+  EXPECT_TRUE(mxcsr & kMXCSRDiv0Flag);
+}
+
 TEST(ExceptionTest, IOAccessFault) {
   // See above.
   struct sigaction sa = {};
-- 
cgit v1.2.3


From aa70523da21534d8518eaa52f36db002e3d61885 Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Mon, 2 Dec 2019 05:37:09 -0800
Subject: Port tests in udp_socket.cc to Fuchsia

Separate out a test in udp_socket.cc that depends on <linux/errqueue.h> so the
rest of the tests can run on Fuchsia.

PiperOrigin-RevId: 283322633
---
 test/syscalls/linux/BUILD                          |   23 +-
 test/syscalls/linux/udp_socket.cc                  | 1321 +-------------------
 .../linux/udp_socket_errqueue_test_case.cc         |   54 +
 test/syscalls/linux/udp_socket_test_cases.cc       | 1279 +++++++++++++++++++
 test/syscalls/linux/udp_socket_test_cases.h        |   74 ++
 5 files changed, 1427 insertions(+), 1324 deletions(-)
 create mode 100644 test/syscalls/linux/udp_socket_errqueue_test_case.cc
 create mode 100644 test/syscalls/linux/udp_socket_test_cases.cc
 create mode 100644 test/syscalls/linux/udp_socket_test_cases.h

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 6345ea28c..2dd115409 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3351,11 +3351,15 @@ cc_binary(
     ],
 )
 
-cc_binary(
-    name = "udp_socket_test",
+cc_library(
+    name = "udp_socket_test_cases",
     testonly = 1,
-    srcs = ["udp_socket.cc"],
-    linkstatic = 1,
+    srcs = [
+        "udp_socket_test_cases.cc",
+    ] + select_for_linux([
+        "udp_socket_errqueue_test_case.cc",
+    ]),
+    hdrs = ["udp_socket_test_cases.h"],
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
@@ -3366,6 +3370,17 @@ cc_binary(
         "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
     ],
+    alwayslink = 1,
+)
+
+cc_binary(
+    name = "udp_socket_test",
+    testonly = 1,
+    srcs = ["udp_socket.cc"],
+    linkstatic = 1,
+    deps = [
+        ":udp_socket_test_cases",
+    ],
 )
 
 cc_binary(
diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc
index 111dbacdf..7a8ac30a4 100644
--- a/test/syscalls/linux/udp_socket.cc
+++ b/test/syscalls/linux/udp_socket.cc
@@ -12,1332 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <arpa/inet.h>
-#include <fcntl.h>
-#include <linux/errqueue.h>
-#include <netinet/in.h>
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-
-#include "gtest/gtest.h"
-#include "absl/base/macros.h"
-#include "absl/time/clock.h"
-#include "absl/time/time.h"
-#include "test/syscalls/linux/socket_test_util.h"
-#include "test/syscalls/linux/unix_domain_socket_test_util.h"
-#include "test/util/test_util.h"
-#include "test/util/thread_util.h"
+#include "test/syscalls/linux/udp_socket_test_cases.h"
 
 namespace gvisor {
 namespace testing {
 
 namespace {
 
-// The initial port to be be used on gvisor.
-constexpr int TestPort = 40000;
-
-// Fixture for tests parameterized by the address family to use (AF_INET and
-// AF_INET6) when creating sockets.
-class UdpSocketTest : public ::testing::TestWithParam<AddressFamily> {
- protected:
-  // Creates two sockets that will be used by test cases.
-  void SetUp() override;
-
-  // Closes the sockets created by SetUp().
-  void TearDown() override {
-    EXPECT_THAT(close(s_), SyscallSucceeds());
-    EXPECT_THAT(close(t_), SyscallSucceeds());
-
-    for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
-      ASSERT_NO_ERRNO(FreeAvailablePort(ports_[i]));
-    }
-  }
-
-  // First UDP socket.
-  int s_;
-
-  // Second UDP socket.
-  int t_;
-
-  // The length of the socket address.
-  socklen_t addrlen_;
-
-  // Initialized address pointing to loopback and port TestPort+i.
-  struct sockaddr* addr_[3];
-
-  // Initialize "any" address.
-  struct sockaddr* anyaddr_;
-
-  // Used ports.
-  int ports_[3];
-
- private:
-  // Storage for the loopback addresses.
-  struct sockaddr_storage addr_storage_[3];
-
-  // Storage for the "any" address.
-  struct sockaddr_storage anyaddr_storage_;
-};
-
-// Gets a pointer to the port component of the given address.
-uint16_t* Port(struct sockaddr_storage* addr) {
-  switch (addr->ss_family) {
-    case AF_INET: {
-      auto sin = reinterpret_cast<struct sockaddr_in*>(addr);
-      return &sin->sin_port;
-    }
-    case AF_INET6: {
-      auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr);
-      return &sin6->sin6_port;
-    }
-  }
-
-  return nullptr;
-}
-
-void UdpSocketTest::SetUp() {
-  int type;
-  if (GetParam() == AddressFamily::kIpv4) {
-    type = AF_INET;
-    auto sin = reinterpret_cast<struct sockaddr_in*>(&anyaddr_storage_);
-    addrlen_ = sizeof(*sin);
-    sin->sin_addr.s_addr = htonl(INADDR_ANY);
-  } else {
-    type = AF_INET6;
-    auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&anyaddr_storage_);
-    addrlen_ = sizeof(*sin6);
-    if (GetParam() == AddressFamily::kIpv6) {
-      sin6->sin6_addr = IN6ADDR_ANY_INIT;
-    } else {
-      TestAddress const& v4_mapped_any = V4MappedAny();
-      sin6->sin6_addr =
-          reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
-              ->sin6_addr;
-    }
-  }
-  ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
-
-  ASSERT_THAT(t_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
-
-  memset(&anyaddr_storage_, 0, sizeof(anyaddr_storage_));
-  anyaddr_ = reinterpret_cast<struct sockaddr*>(&anyaddr_storage_);
-  anyaddr_->sa_family = type;
-
-  if (gvisor::testing::IsRunningOnGvisor()) {
-    for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
-      ports_[i] = TestPort + i;
-    }
-  } else {
-    // When not under gvisor, use utility function to pick port. Assert that
-    // all ports are different.
-    std::string error;
-    for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
-      // Find an unused port, we specify port 0 to allow the kernel to provide
-      // the port.
-      bool unique = true;
-      do {
-        ports_[i] = ASSERT_NO_ERRNO_AND_VALUE(PortAvailable(
-            0, AddressFamily::kDualStack, SocketType::kUdp, false));
-        ASSERT_GT(ports_[i], 0);
-        for (size_t j = 0; j < i; ++j) {
-          if (ports_[j] == ports_[i]) {
-            unique = false;
-            break;
-          }
-        }
-      } while (!unique);
-    }
-  }
-
-  // Initialize the sockaddrs.
-  for (size_t i = 0; i < ABSL_ARRAYSIZE(addr_); ++i) {
-    memset(&addr_storage_[i], 0, sizeof(addr_storage_[i]));
-
-    addr_[i] = reinterpret_cast<struct sockaddr*>(&addr_storage_[i]);
-    addr_[i]->sa_family = type;
-
-    switch (type) {
-      case AF_INET: {
-        auto sin = reinterpret_cast<struct sockaddr_in*>(addr_[i]);
-        sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-        sin->sin_port = htons(ports_[i]);
-        break;
-      }
-      case AF_INET6: {
-        auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr_[i]);
-        sin6->sin6_addr = in6addr_loopback;
-        sin6->sin6_port = htons(ports_[i]);
-        break;
-      }
-    }
-  }
-}
-
-TEST_P(UdpSocketTest, Creation) {
-  int type = AF_INET6;
-  if (GetParam() == AddressFamily::kIpv4) {
-    type = AF_INET;
-  }
-
-  int s_;
-
-  ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
-  EXPECT_THAT(close(s_), SyscallSucceeds());
-
-  ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, 0), SyscallSucceeds());
-  EXPECT_THAT(close(s_), SyscallSucceeds());
-
-  ASSERT_THAT(s_ = socket(type, SOCK_STREAM, IPPROTO_UDP), SyscallFails());
-}
-
-TEST_P(UdpSocketTest, Getsockname) {
-  // Check that we're not bound.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, anyaddr_, addrlen_), 0);
-
-  // Bind, then check that we get the right address.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, Getpeername) {
-  // Check that we're not connected.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallFailsWithErrno(ENOTCONN));
-
-  // Connect, then check that we get the right address.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  addrlen = sizeof(addr);
-  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, SendNotConnected) {
-  // Do send & write, they must fail.
-  char buf[512];
-  EXPECT_THAT(send(s_, buf, sizeof(buf), 0),
-              SyscallFailsWithErrno(EDESTADDRREQ));
-
-  EXPECT_THAT(write(s_, buf, sizeof(buf)), SyscallFailsWithErrno(EDESTADDRREQ));
-
-  // Use sendto.
-  ASSERT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Check that we're bound now.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_NE(*Port(&addr), 0);
-}
-
-TEST_P(UdpSocketTest, ConnectBinds) {
-  // Connect the socket.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Check that we're bound now.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_NE(*Port(&addr), 0);
-}
-
-TEST_P(UdpSocketTest, ReceiveNotBound) {
-  char buf[512];
-  EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, Bind) {
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Try to bind again.
-  EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL));
-
-  // Check that we're still bound to the original address.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, BindInUse) {
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Try to bind again.
-  EXPECT_THAT(bind(t_, addr_[0], addrlen_), SyscallFailsWithErrno(EADDRINUSE));
-}
-
-TEST_P(UdpSocketTest, ReceiveAfterConnect) {
-  // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Get the address s_ was bound to during connect.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-
-  // Send from t_ to s_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0,
-                     reinterpret_cast<sockaddr*>(&addr), addrlen),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Receive the data.
-  char received[sizeof(buf)];
-  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, ReceiveAfterDisconnect) {
-  // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Get the address s_ was bound to during connect.
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  EXPECT_EQ(addrlen, addrlen_);
-
-  for (int i = 0; i < 2; i++) {
-    // Send from t_ to s_.
-    char buf[512];
-    RandomizeBuffer(buf, sizeof(buf));
-    EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-                SyscallSucceeds());
-    ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0,
-                       reinterpret_cast<sockaddr*>(&addr), addrlen),
-                SyscallSucceedsWithValue(sizeof(buf)));
-
-    // Receive the data.
-    char received[sizeof(buf)];
-    EXPECT_THAT(recv(s_, received, sizeof(received), 0),
-                SyscallSucceedsWithValue(sizeof(received)));
-    EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-
-    // Disconnect s_.
-    struct sockaddr addr = {};
-    addr.sa_family = AF_UNSPEC;
-    ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)), SyscallSucceeds());
-    // Connect s_ loopback:TestPort.
-    ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-  }
-}
-
-TEST_P(UdpSocketTest, Connect) {
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Check that we're connected to the right peer.
-  struct sockaddr_storage peer;
-  socklen_t peerlen = sizeof(peer);
-  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
-              SyscallSucceeds());
-  EXPECT_EQ(peerlen, addrlen_);
-  EXPECT_EQ(memcmp(&peer, addr_[0], addrlen_), 0);
-
-  // Try to bind after connect.
-  EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL));
-
-  // Try to connect again.
-  EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
-
-  // Check that peer name changed.
-  peerlen = sizeof(peer);
-  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
-              SyscallSucceeds());
-  EXPECT_EQ(peerlen, addrlen_);
-  EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
-}
-
-void ConnectAny(AddressFamily family, int sockfd, uint16_t port) {
-  struct sockaddr_storage addr = {};
-
-  // Precondition check.
-  {
-    socklen_t addrlen = sizeof(addr);
-    EXPECT_THAT(
-        getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-        SyscallSucceeds());
-
-    if (family == AddressFamily::kIpv4) {
-      auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
-      EXPECT_EQ(addrlen, sizeof(*addr_out));
-      EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
-    } else {
-      auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
-      EXPECT_EQ(addrlen, sizeof(*addr_out));
-      struct in6_addr any = IN6ADDR_ANY_INIT;
-      EXPECT_EQ(memcmp(&addr_out->sin6_addr, &any, sizeof(in6_addr)), 0);
-    }
-
-    {
-      socklen_t addrlen = sizeof(addr);
-      EXPECT_THAT(
-          getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-          SyscallFailsWithErrno(ENOTCONN));
-    }
-
-    struct sockaddr_storage baddr = {};
-    if (family == AddressFamily::kIpv4) {
-      auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
-      addrlen = sizeof(*addr_in);
-      addr_in->sin_family = AF_INET;
-      addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
-      addr_in->sin_port = port;
-    } else {
-      auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
-      addrlen = sizeof(*addr_in);
-      addr_in->sin6_family = AF_INET6;
-      addr_in->sin6_port = port;
-      if (family == AddressFamily::kIpv6) {
-        addr_in->sin6_addr = IN6ADDR_ANY_INIT;
-      } else {
-        TestAddress const& v4_mapped_any = V4MappedAny();
-        addr_in->sin6_addr =
-            reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
-                ->sin6_addr;
-      }
-    }
-
-    // TODO(b/138658473): gVisor doesn't allow connecting to the zero port.
-    if (port == 0) {
-      SKIP_IF(IsRunningOnGvisor());
-    }
-
-    ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen),
-                SyscallSucceeds());
-  }
-
-  // Postcondition check.
-  {
-    socklen_t addrlen = sizeof(addr);
-    EXPECT_THAT(
-        getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-        SyscallSucceeds());
-
-    if (family == AddressFamily::kIpv4) {
-      auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
-      EXPECT_EQ(addrlen, sizeof(*addr_out));
-      EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK));
-    } else {
-      auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
-      EXPECT_EQ(addrlen, sizeof(*addr_out));
-      struct in6_addr loopback;
-      if (family == AddressFamily::kIpv6) {
-        loopback = IN6ADDR_LOOPBACK_INIT;
-      } else {
-        TestAddress const& v4_mapped_loopback = V4MappedLoopback();
-        loopback = reinterpret_cast<const struct sockaddr_in6*>(
-                       &v4_mapped_loopback.addr)
-                       ->sin6_addr;
-      }
-
-      EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
-    }
-
-    addrlen = sizeof(addr);
-    if (port == 0) {
-      EXPECT_THAT(
-          getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-          SyscallFailsWithErrno(ENOTCONN));
-    } else {
-      EXPECT_THAT(
-          getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-          SyscallSucceeds());
-    }
-  }
-}
-
-TEST_P(UdpSocketTest, ConnectAny) { ConnectAny(GetParam(), s_, 0); }
-
-TEST_P(UdpSocketTest, ConnectAnyWithPort) {
-  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
-  ConnectAny(GetParam(), s_, port);
-}
-
-void DisconnectAfterConnectAny(AddressFamily family, int sockfd, int port) {
-  struct sockaddr_storage addr = {};
-
-  socklen_t addrlen = sizeof(addr);
-  struct sockaddr_storage baddr = {};
-  if (family == AddressFamily::kIpv4) {
-    auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
-    addrlen = sizeof(*addr_in);
-    addr_in->sin_family = AF_INET;
-    addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
-    addr_in->sin_port = port;
-  } else {
-    auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
-    addrlen = sizeof(*addr_in);
-    addr_in->sin6_family = AF_INET6;
-    addr_in->sin6_port = port;
-    if (family == AddressFamily::kIpv6) {
-      addr_in->sin6_addr = IN6ADDR_ANY_INIT;
-    } else {
-      TestAddress const& v4_mapped_any = V4MappedAny();
-      addr_in->sin6_addr =
-          reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
-              ->sin6_addr;
-    }
-  }
-
-  // TODO(b/138658473): gVisor doesn't allow connecting to the zero port.
-  if (port == 0) {
-    SKIP_IF(IsRunningOnGvisor());
-  }
-
-  ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen),
-              SyscallSucceeds());
-  // Now the socket is bound to the loopback address.
-
-  // Disconnect
-  addrlen = sizeof(addr);
-  addr.ss_family = AF_UNSPEC;
-  ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&addr), addrlen),
-              SyscallSucceeds());
-
-  // Check that after disconnect the socket is bound to the ANY address.
-  EXPECT_THAT(getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-  if (family == AddressFamily::kIpv4) {
-    auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
-    EXPECT_EQ(addrlen, sizeof(*addr_out));
-    EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
-  } else {
-    auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
-    EXPECT_EQ(addrlen, sizeof(*addr_out));
-    struct in6_addr loopback = IN6ADDR_ANY_INIT;
-
-    EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
-  }
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterConnectAny) {
-  DisconnectAfterConnectAny(GetParam(), s_, 0);
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterConnectAnyWithPort) {
-  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
-  DisconnectAfterConnectAny(GetParam(), s_, port);
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterBind) {
-  ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
-  // Connect the socket.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  struct sockaddr_storage addr = {};
-  addr.ss_family = AF_UNSPEC;
-  EXPECT_THAT(
-      connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
-      SyscallSucceeds());
-
-  // Check that we're still bound.
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0);
-
-  addrlen = sizeof(addr);
-  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallFailsWithErrno(ENOTCONN));
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterBindToAny) {
-  struct sockaddr_storage baddr = {};
-  socklen_t addrlen;
-  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
-  if (GetParam() == AddressFamily::kIpv4) {
-    auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
-    addr_in->sin_family = AF_INET;
-    addr_in->sin_port = port;
-    addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
-  } else {
-    auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
-    addr_in->sin6_family = AF_INET6;
-    addr_in->sin6_port = port;
-    addr_in->sin6_scope_id = 0;
-    addr_in->sin6_addr = IN6ADDR_ANY_INIT;
-  }
-  ASSERT_THAT(bind(s_, reinterpret_cast<sockaddr*>(&baddr), addrlen_),
-              SyscallSucceeds());
-  // Connect the socket.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  struct sockaddr_storage addr = {};
-  addr.ss_family = AF_UNSPEC;
-  EXPECT_THAT(
-      connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
-      SyscallSucceeds());
-
-  // Check that we're still bound.
-  addrlen = sizeof(addr);
-  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceeds());
-
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, &baddr, addrlen), 0);
-
-  addrlen = sizeof(addr);
-  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallFailsWithErrno(ENOTCONN));
-}
-
-TEST_P(UdpSocketTest, Disconnect) {
-  for (int i = 0; i < 2; i++) {
-    // Try to connect again.
-    EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
-
-    // Check that we're connected to the right peer.
-    struct sockaddr_storage peer;
-    socklen_t peerlen = sizeof(peer);
-    EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
-                SyscallSucceeds());
-    EXPECT_EQ(peerlen, addrlen_);
-    EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
-
-    // Try to disconnect.
-    struct sockaddr_storage addr = {};
-    addr.ss_family = AF_UNSPEC;
-    EXPECT_THAT(
-        connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
-        SyscallSucceeds());
-
-    peerlen = sizeof(peer);
-    EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
-                SyscallFailsWithErrno(ENOTCONN));
-
-    // Check that we're still bound.
-    socklen_t addrlen = sizeof(addr);
-    EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
-                SyscallSucceeds());
-    EXPECT_EQ(addrlen, addrlen_);
-    EXPECT_EQ(*Port(&addr), 0);
-  }
-}
-
-TEST_P(UdpSocketTest, ConnectBadAddress) {
-  struct sockaddr addr = {};
-  addr.sa_family = addr_[0]->sa_family;
-  ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)),
-              SyscallFailsWithErrno(EINVAL));
-}
-
-TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) {
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Send to a different destination than we're connected to.
-  char buf[512];
-  EXPECT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[1], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-}
-
-TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
-  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Bind t_ to loopback:TestPort+1.
-  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
-
-  char buf[3];
-  // Send zero length packet from s_ to t_.
-  ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
-  // Receive the packet.
-  char received[3];
-  EXPECT_THAT(read(t_, received, sizeof(received)),
-              SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
-  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Bind t_ to loopback:TestPort+1.
-  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Set t_ to non-blocking.
-  int opts = 0;
-  ASSERT_THAT(opts = fcntl(t_, F_GETFL), SyscallSucceeds());
-  ASSERT_THAT(fcntl(t_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds());
-
-  char buf[3];
-  // Send zero length packet from s_ to t_.
-  ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
-  // Receive the packet.
-  char received[3];
-  EXPECT_THAT(read(t_, received, sizeof(received)),
-              SyscallSucceedsWithValue(0));
-  EXPECT_THAT(read(t_, received, sizeof(received)),
-              SyscallFailsWithErrno(EAGAIN));
-}
-
-TEST_P(UdpSocketTest, SendAndReceiveNotConnected) {
-  // Bind s_ to loopback.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Send some data to s_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Receive the data.
-  char received[sizeof(buf)];
-  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, SendAndReceiveConnected) {
-  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Bind t_ to loopback:TestPort+1.
-  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Send some data from t_ to s_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Receive the data.
-  char received[sizeof(buf)];
-  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, ReceiveFromNotConnected) {
-  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Bind t_ to loopback:TestPort+2.
-  ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
-
-  // Send some data from t_ to s_.
-  char buf[512];
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Check that the data isn't_ received because it was sent from a different
-  // address than we're connected.
-  EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, ReceiveBeforeConnect) {
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Bind t_ to loopback:TestPort+2.
-  ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
-
-  // Send some data from t_ to s_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Connect to loopback:TestPort+1.
-  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Receive the data. It works because it was sent before the connect.
-  char received[sizeof(buf)];
-  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-
-  // Send again. This time it should not be received.
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, ReceiveFrom) {
-  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Bind t_ to loopback:TestPort+1.
-  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Send some data from t_ to s_.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // Receive the data and sender address.
-  char received[sizeof(buf)];
-  struct sockaddr_storage addr;
-  socklen_t addrlen = sizeof(addr);
-  EXPECT_THAT(recvfrom(s_, received, sizeof(received), 0,
-                       reinterpret_cast<sockaddr*>(&addr), &addrlen),
-              SyscallSucceedsWithValue(sizeof(received)));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-  EXPECT_EQ(addrlen, addrlen_);
-  EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, Listen) {
-  ASSERT_THAT(listen(s_, SOMAXCONN), SyscallFailsWithErrno(EOPNOTSUPP));
-}
-
-TEST_P(UdpSocketTest, Accept) {
-  ASSERT_THAT(accept(s_, nullptr, nullptr), SyscallFailsWithErrno(EOPNOTSUPP));
-}
-
-// This test validates that a read shutdown with pending data allows the read
-// to proceed with the data before returning EAGAIN.
-TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) {
-  char received[512];
-
-  // Bind t_ to loopback:TestPort+2.
-  ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds());
-
-  // Connect the socket, then try to shutdown again.
-  ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
-
-  // Verify that we get EWOULDBLOCK when there is nothing to read.
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  const char* buf = "abc";
-  EXPECT_THAT(write(t_, buf, 3), SyscallSucceedsWithValue(3));
-
-  int opts = 0;
-  ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
-  ASSERT_THAT(fcntl(s_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds());
-  ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
-  ASSERT_NE(opts & O_NONBLOCK, 0);
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
-  // We should get the data even though read has been shutdown.
-  EXPECT_THAT(recv(s_, received, 2, 0), SyscallSucceedsWithValue(2));
-
-  // Because we read less than the entire packet length, since it's a packet
-  // based socket any subsequent reads should return EWOULDBLOCK.
-  EXPECT_THAT(recv(s_, received, 1, 0), SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-// This test is validating that even after a socket is shutdown if it's
-// reconnected it will reset the shutdown state.
-TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) {
-  char received[512];
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
-
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  // Connect the socket, then try to shutdown again.
-  ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
-
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, ReadShutdown) {
-  char received[512];
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
-
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  // Connect the socket, then try to shutdown again.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
-  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UdpSocketTest, ReadShutdownDifferentThread) {
-  char received[512];
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  // Connect the socket, then shutdown from another thread.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  ScopedThread t([&] {
-    absl::SleepFor(absl::Milliseconds(200));
-    EXPECT_THAT(shutdown(this->s_, SHUT_RD), SyscallSucceeds());
-  });
-  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(0));
-  t.Join();
-
-  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UdpSocketTest, WriteShutdown) {
-  EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-  EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceeds());
-}
-
-TEST_P(UdpSocketTest, SynchronousReceive) {
-  // Bind s_ to loopback.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Send some data to s_ from another thread.
-  char buf[512];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  // Receive the data prior to actually starting the other thread.
-  char received[512];
-  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), MSG_DONTWAIT),
-              SyscallFailsWithErrno(EWOULDBLOCK));
-
-  // Start the thread.
-  ScopedThread t([&] {
-    absl::SleepFor(absl::Milliseconds(200));
-    ASSERT_THAT(
-        sendto(this->t_, buf, sizeof(buf), 0, this->addr_[0], this->addrlen_),
-        SyscallSucceedsWithValue(sizeof(buf)));
-  });
-
-  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
-              SyscallSucceedsWithValue(512));
-  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, BoundaryPreserved_SendRecv) {
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Send 3 packets from t_ to s_.
-  constexpr int psize = 100;
-  char buf[3 * psize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  for (int i = 0; i < 3; ++i) {
-    ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_),
-                SyscallSucceedsWithValue(psize));
-  }
-
-  // Receive the data as 3 separate packets.
-  char received[6 * psize];
-  for (int i = 0; i < 3; ++i) {
-    EXPECT_THAT(recv(s_, received + i * psize, 3 * psize, 0),
-                SyscallSucceedsWithValue(psize));
-  }
-  EXPECT_EQ(memcmp(buf, received, 3 * psize), 0);
-}
-
-TEST_P(UdpSocketTest, BoundaryPreserved_WritevReadv) {
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Direct writes from t_ to s_.
-  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Send 2 packets from t_ to s_, where each packet's data consists of 2
-  // discontiguous iovecs.
-  constexpr size_t kPieceSize = 100;
-  char buf[4 * kPieceSize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  for (int i = 0; i < 2; i++) {
-    struct iovec iov[2];
-    for (int j = 0; j < 2; j++) {
-      iov[j].iov_base = reinterpret_cast<void*>(
-          reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
-      iov[j].iov_len = kPieceSize;
-    }
-    ASSERT_THAT(writev(t_, iov, 2), SyscallSucceedsWithValue(2 * kPieceSize));
-  }
-
-  // Receive the data as 2 separate packets.
-  char received[6 * kPieceSize];
-  for (int i = 0; i < 2; i++) {
-    struct iovec iov[3];
-    for (int j = 0; j < 3; j++) {
-      iov[j].iov_base = reinterpret_cast<void*>(
-          reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
-      iov[j].iov_len = kPieceSize;
-    }
-    ASSERT_THAT(readv(s_, iov, 3), SyscallSucceedsWithValue(2 * kPieceSize));
-  }
-  EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
-}
-
-TEST_P(UdpSocketTest, BoundaryPreserved_SendMsgRecvMsg) {
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Send 2 packets from t_ to s_, where each packet's data consists of 2
-  // discontiguous iovecs.
-  constexpr size_t kPieceSize = 100;
-  char buf[4 * kPieceSize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  for (int i = 0; i < 2; i++) {
-    struct iovec iov[2];
-    for (int j = 0; j < 2; j++) {
-      iov[j].iov_base = reinterpret_cast<void*>(
-          reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
-      iov[j].iov_len = kPieceSize;
-    }
-    struct msghdr msg = {};
-    msg.msg_name = addr_[0];
-    msg.msg_namelen = addrlen_;
-    msg.msg_iov = iov;
-    msg.msg_iovlen = 2;
-    ASSERT_THAT(sendmsg(t_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize));
-  }
-
-  // Receive the data as 2 separate packets.
-  char received[6 * kPieceSize];
-  for (int i = 0; i < 2; i++) {
-    struct iovec iov[3];
-    for (int j = 0; j < 3; j++) {
-      iov[j].iov_base = reinterpret_cast<void*>(
-          reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
-      iov[j].iov_len = kPieceSize;
-    }
-    struct msghdr msg = {};
-    msg.msg_iov = iov;
-    msg.msg_iovlen = 3;
-    ASSERT_THAT(recvmsg(s_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize));
-  }
-  EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
-}
-
-TEST_P(UdpSocketTest, FIONREADShutdown) {
-  int n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  // A UDP socket must be connected before it can be shutdown.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-}
-
-TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
-  int n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // A UDP socket must be connected before it can be shutdown.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  const char str[] = "abc";
-  ASSERT_THAT(send(s_, str, sizeof(str), 0),
-              SyscallSucceedsWithValue(sizeof(str)));
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, sizeof(str));
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, sizeof(str));
-}
-
-TEST_P(UdpSocketTest, FIONREAD) {
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Check that the bound socket with an empty buffer reports an empty first
-  // packet.
-  int n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  // Send 3 packets from t_ to s_.
-  constexpr int psize = 100;
-  char buf[3 * psize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  for (int i = 0; i < 3; ++i) {
-    ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_),
-                SyscallSucceedsWithValue(psize));
-
-    // Check that regardless of how many packets are in the queue, the size
-    // reported is that of a single packet.
-    n = -1;
-    EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-    EXPECT_EQ(n, psize);
-  }
-}
-
-TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) {
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // Check that the bound socket with an empty buffer reports an empty first
-  // packet.
-  int n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  // Send 3 packets from t_ to s_.
-  constexpr int psize = 100;
-  char buf[3 * psize];
-  RandomizeBuffer(buf, sizeof(buf));
-
-  for (int i = 0; i < 3; ++i) {
-    ASSERT_THAT(sendto(t_, buf + i * psize, 0, 0, addr_[0], addrlen_),
-                SyscallSucceedsWithValue(0));
-
-    // Check that regardless of how many packets are in the queue, the size
-    // reported is that of a single packet.
-    n = -1;
-    EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-    EXPECT_EQ(n, 0);
-  }
-}
-
-TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) {
-  int n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  // Bind s_ to loopback:TestPort.
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  // A UDP socket must be connected before it can be shutdown.
-  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  const char str[] = "abc";
-  ASSERT_THAT(send(s_, str, 0, 0), SyscallSucceedsWithValue(0));
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-
-  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
-  n = -1;
-  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
-  EXPECT_EQ(n, 0);
-}
-
-TEST_P(UdpSocketTest, ErrorQueue) {
-  char cmsgbuf[CMSG_SPACE(sizeof(sock_extended_err))];
-  msghdr msg;
-  memset(&msg, 0, sizeof(msg));
-  iovec iov;
-  memset(&iov, 0, sizeof(iov));
-  msg.msg_iov = &iov;
-  msg.msg_iovlen = 1;
-  msg.msg_control = cmsgbuf;
-  msg.msg_controllen = sizeof(cmsgbuf);
-
-  // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT.
-  EXPECT_THAT(RetryEINTR(recvmsg)(s_, &msg, MSG_ERRQUEUE),
-              SyscallFailsWithErrno(EAGAIN));
-}
-
-TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
-  int v = -1;
-  socklen_t optlen = sizeof(v);
-  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, &optlen),
-              SyscallSucceeds());
-  ASSERT_EQ(v, kSockOptOff);
-  ASSERT_EQ(optlen, sizeof(v));
-}
-
-TEST_P(UdpSocketTest, SoTimestamp) {
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
-  int v = 1;
-  ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
-              SyscallSucceeds());
-
-  char buf[3];
-  // Send zero length packet from t_ to s_.
-  ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
-
-  char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
-  msghdr msg;
-  memset(&msg, 0, sizeof(msg));
-  iovec iov;
-  memset(&iov, 0, sizeof(iov));
-  msg.msg_iov = &iov;
-  msg.msg_iovlen = 1;
-  msg.msg_control = cmsgbuf;
-  msg.msg_controllen = sizeof(cmsgbuf);
-
-  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
-
-  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
-  ASSERT_NE(cmsg, nullptr);
-  ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
-  ASSERT_EQ(cmsg->cmsg_type, SO_TIMESTAMP);
-  ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct timeval)));
-
-  struct timeval tv = {};
-  memcpy(&tv, CMSG_DATA(cmsg), sizeof(struct timeval));
-
-  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
-
-  // There should be nothing to get via ioctl.
-  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT));
-}
-
-TEST_P(UdpSocketTest, WriteShutdownNotConnected) {
-  EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
-}
-
-TEST_P(UdpSocketTest, TimestampIoctl) {
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
-  char buf[3];
-  // Send packet from t_ to s_.
-  ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
-              SyscallSucceedsWithValue(sizeof(buf)));
-
-  // There should be no control messages.
-  char recv_buf[sizeof(buf)];
-  ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
-
-  // A nonzero timeval should be available via ioctl.
-  struct timeval tv = {};
-  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds());
-  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
-}
-
-TEST_P(UdpSocketTest, TimetstampIoctlNothingRead) {
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
-  struct timeval tv = {};
-  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT));
-}
-
-// Test that the timestamp accessed via SIOCGSTAMP is still accessible after
-// SO_TIMESTAMP is enabled and used to retrieve a timestamp.
-TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
-  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
-  char buf[3];
-  // Send packet from t_ to s_.
-  ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
-              SyscallSucceedsWithValue(sizeof(buf)));
-  ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
-
-  // There should be no control messages.
-  char recv_buf[sizeof(buf)];
-  ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
-
-  // A nonzero timeval should be available via ioctl.
-  struct timeval tv = {};
-  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds());
-  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
-
-  // Enable SO_TIMESTAMP and send a message.
-  int v = 1;
-  EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
-              SyscallSucceeds());
-  ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
-
-  // There should be a message for SO_TIMESTAMP.
-  char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
-  msghdr msg = {};
-  iovec iov = {};
-  msg.msg_iov = &iov;
-  msg.msg_iovlen = 1;
-  msg.msg_control = cmsgbuf;
-  msg.msg_controllen = sizeof(cmsgbuf);
-  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
-  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
-  cmsg = CMSG_FIRSTHDR(&msg);
-  ASSERT_NE(cmsg, nullptr);
-
-  // The ioctl should return the exact same values as before.
-  struct timeval tv2 = {};
-  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv2), SyscallSucceeds());
-  ASSERT_EQ(tv.tv_sec, tv2.tv_sec);
-  ASSERT_EQ(tv.tv_usec, tv2.tv_usec);
-}
-
 INSTANTIATE_TEST_SUITE_P(AllInetTests, UdpSocketTest,
                          ::testing::Values(AddressFamily::kIpv4,
                                            AddressFamily::kIpv6,
diff --git a/test/syscalls/linux/udp_socket_errqueue_test_case.cc b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
new file mode 100644
index 000000000..147978f46
--- /dev/null
+++ b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
@@ -0,0 +1,54 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/udp_socket_test_cases.h"
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <linux/errqueue.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST_P(UdpSocketTest, ErrorQueue) {
+  // Receive into one zero-length iovec, with control space for a single
+  // sock_extended_err message.
+  char control[CMSG_SPACE(sizeof(sock_extended_err))];
+  iovec iov = {};
+  msghdr msg = {};
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+  msg.msg_control = control;
+  msg.msg_controllen = sizeof(control);
+
+  // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT.
+  EXPECT_THAT(RetryEINTR(recvmsg)(s_, &msg, MSG_ERRQUEUE),
+              SyscallFailsWithErrno(EAGAIN));
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
new file mode 100644
index 000000000..b6090ac66
--- /dev/null
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -0,0 +1,1279 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/udp_socket_test_cases.h"
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Returns the address of the port field inside |addr|, or nullptr when the
+// address family is neither AF_INET nor AF_INET6.
+uint16_t* Port(struct sockaddr_storage* addr) {
+  if (addr->ss_family == AF_INET) {
+    auto* v4 = reinterpret_cast<struct sockaddr_in*>(addr);
+    return &v4->sin_port;
+  }
+
+  if (addr->ss_family == AF_INET6) {
+    auto* v6 = reinterpret_cast<struct sockaddr_in6*>(addr);
+    return &v6->sin6_port;
+  }
+
+  return nullptr;
+}
+
+// Creates sockets s_ and t_ and fills in the ANY address plus the loopback
+// test addresses for the address family selected by the test parameter.
+void UdpSocketTest::SetUp() {
+  int type;
+  if (GetParam() == AddressFamily::kIpv4) {
+    type = AF_INET;
+    auto sin = reinterpret_cast<struct sockaddr_in*>(&anyaddr_storage_);
+    addrlen_ = sizeof(*sin);
+    sin->sin_addr.s_addr = htonl(INADDR_ANY);
+  } else {
+    type = AF_INET6;
+    auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&anyaddr_storage_);
+    addrlen_ = sizeof(*sin6);
+    if (GetParam() == AddressFamily::kIpv6) {
+      sin6->sin6_addr = IN6ADDR_ANY_INIT;
+    } else {
+      TestAddress const& v4_mapped_any = V4MappedAny();
+      sin6->sin6_addr =
+          reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
+              ->sin6_addr;
+    }
+  }
+  ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
+  ASSERT_THAT(t_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
+
+  // NOTE(review): memset wipes the address bytes stored above -- intended?
+  memset(&anyaddr_storage_, 0, sizeof(anyaddr_storage_));
+  anyaddr_ = reinterpret_cast<struct sockaddr*>(&anyaddr_storage_);
+  anyaddr_->sa_family = type;
+
+  if (gvisor::testing::IsRunningOnGvisor()) {
+    for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
+      ports_[i] = TestPort + i;
+    }
+  } else {
+    // When not under gvisor, ask for port 0 so that the kernel provides each
+    // port, and retry until it differs from every previously chosen port.
+    for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
+      bool unique;
+      do {
+        // Reset per attempt; otherwise one collision would loop forever.
+        unique = true;
+        ports_[i] = ASSERT_NO_ERRNO_AND_VALUE(PortAvailable(
+            0, AddressFamily::kDualStack, SocketType::kUdp, false));
+        ASSERT_GT(ports_[i], 0);
+        for (size_t j = 0; j < i; ++j) {
+          if (ports_[j] == ports_[i]) {
+            unique = false;
+            break;
+          }
+        }
+      } while (!unique);
+    }
+  }
+
+  // Initialize the sockaddrs.
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(addr_); ++i) {
+    memset(&addr_storage_[i], 0, sizeof(addr_storage_[i]));
+    addr_[i] = reinterpret_cast<struct sockaddr*>(&addr_storage_[i]);
+    addr_[i]->sa_family = type;
+
+    switch (type) {
+      case AF_INET: {
+        auto sin = reinterpret_cast<struct sockaddr_in*>(addr_[i]);
+        sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+        sin->sin_port = htons(ports_[i]);
+        break;
+      }
+      case AF_INET6: {
+        auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr_[i]);
+        sin6->sin6_addr = in6addr_loopback;
+        sin6->sin6_port = htons(ports_[i]);
+        break;
+      }
+    }
+  }
+}
+
+TEST_P(UdpSocketTest, Creation) {
+  // Every parameter other than kIpv4 is exercised through an AF_INET6 socket.
+  const int family = GetParam() == AddressFamily::kIpv4 ? AF_INET : AF_INET6;
+
+  // Scratch descriptor; named so that it does not hide the fixture's s_.
+  int fd;
+
+  // SOCK_DGRAM works with an explicit IPPROTO_UDP and with protocol 0.
+  ASSERT_THAT(fd = socket(family, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+  ASSERT_THAT(fd = socket(family, SOCK_DGRAM, 0), SyscallSucceeds());
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+
+  // SOCK_STREAM combined with IPPROTO_UDP must be rejected.
+  ASSERT_THAT(fd = socket(family, SOCK_STREAM, IPPROTO_UDP), SyscallFails());
+}
+
+TEST_P(UdpSocketTest, Getsockname) {
+  // Check that we're not bound.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, anyaddr_, addrlen_), 0);
+
+  // Bind, then check that we get the right address.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, Getpeername) {
+  // Check that we're not connected.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallFailsWithErrno(ENOTCONN));
+
+  // Connect, then check that we get the right address.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  addrlen = sizeof(addr);
+  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, SendNotConnected) {
+  // Do send & write, they must fail.
+  char buf[512];
+  EXPECT_THAT(send(s_, buf, sizeof(buf), 0),
+              SyscallFailsWithErrno(EDESTADDRREQ));
+
+  EXPECT_THAT(write(s_, buf, sizeof(buf)), SyscallFailsWithErrno(EDESTADDRREQ));
+
+  // Use sendto.
+  ASSERT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Check that we're bound now.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_NE(*Port(&addr), 0);
+}
+
+TEST_P(UdpSocketTest, ConnectBinds) {
+  // Connect the socket.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Check that we're bound now.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_NE(*Port(&addr), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveNotBound) {
+  char buf[512];
+  EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, Bind) {
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Try to bind again.
+  EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL));
+
+  // Check that we're still bound to the original address.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, BindInUse) {
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Try to bind again.
+  EXPECT_THAT(bind(t_, addr_[0], addrlen_), SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(UdpSocketTest, ReceiveAfterConnect) {
+  // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Get the address s_ was bound to during connect.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+
+  // Send from t_ to s_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0,
+                     reinterpret_cast<sockaddr*>(&addr), addrlen),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Receive the data.
+  char received[sizeof(buf)];
+  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveAfterDisconnect) {
+  // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Get the address s_ was bound to during connect.
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  EXPECT_EQ(addrlen, addrlen_);
+
+  for (int i = 0; i < 2; i++) {
+    // Send from t_ to s_, using the address s_ is bound to at this point.
+    char buf[512];
+    RandomizeBuffer(buf, sizeof(buf));
+    EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+                SyscallSucceeds());
+    ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0,
+                       reinterpret_cast<sockaddr*>(&addr), addrlen),
+                SyscallSucceedsWithValue(sizeof(buf)));
+    // Receive the data.
+    char received[sizeof(buf)];
+    EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+                SyscallSucceedsWithValue(sizeof(received)));
+    EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+
+    // Disconnect s_ by connecting to an AF_UNSPEC address. The sockaddr gets
+    // its own name so that it does not hide the sockaddr_storage used above.
+    struct sockaddr unspec = {};
+    unspec.sa_family = AF_UNSPEC;
+    ASSERT_THAT(connect(s_, &unspec, sizeof(unspec.sa_family)),
+                SyscallSucceeds());
+    // Connect s_ to loopback:TestPort again.
+    ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+  }
+}
+
+TEST_P(UdpSocketTest, Connect) {
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Check that we're connected to the right peer.
+  struct sockaddr_storage peer;
+  socklen_t peerlen = sizeof(peer);
+  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
+              SyscallSucceeds());
+  EXPECT_EQ(peerlen, addrlen_);
+  EXPECT_EQ(memcmp(&peer, addr_[0], addrlen_), 0);
+
+  // Try to bind after connect.
+  EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL));
+
+  // Try to connect again.
+  EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
+
+  // Check that peer name changed.
+  peerlen = sizeof(peer);
+  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
+              SyscallSucceeds());
+  EXPECT_EQ(peerlen, addrlen_);
+  EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
+}
+
+void ConnectAny(AddressFamily family, int sockfd, uint16_t port) {
+  struct sockaddr_storage addr = {};
+
+  // Precondition check.
+  {
+    socklen_t addrlen = sizeof(addr);
+    EXPECT_THAT(
+        getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+        SyscallSucceeds());
+
+    if (family == AddressFamily::kIpv4) {
+      auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
+      EXPECT_EQ(addrlen, sizeof(*addr_out));
+      EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
+    } else {
+      auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
+      EXPECT_EQ(addrlen, sizeof(*addr_out));
+      struct in6_addr any = IN6ADDR_ANY_INIT;
+      EXPECT_EQ(memcmp(&addr_out->sin6_addr, &any, sizeof(in6_addr)), 0);
+    }
+
+    {
+      socklen_t addrlen = sizeof(addr);
+      EXPECT_THAT(
+          getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+          SyscallFailsWithErrno(ENOTCONN));
+    }
+
+    struct sockaddr_storage baddr = {};
+    if (family == AddressFamily::kIpv4) {
+      auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
+      addrlen = sizeof(*addr_in);
+      addr_in->sin_family = AF_INET;
+      addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
+      addr_in->sin_port = port;
+    } else {
+      auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
+      addrlen = sizeof(*addr_in);
+      addr_in->sin6_family = AF_INET6;
+      addr_in->sin6_port = port;
+      if (family == AddressFamily::kIpv6) {
+        addr_in->sin6_addr = IN6ADDR_ANY_INIT;
+      } else {
+        TestAddress const& v4_mapped_any = V4MappedAny();
+        addr_in->sin6_addr =
+            reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
+                ->sin6_addr;
+      }
+    }
+
+    // TODO(b/138658473): gVisor doesn't allow connecting to the zero port.
+    if (port == 0) {
+      SKIP_IF(IsRunningOnGvisor());
+    }
+
+    ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen),
+                SyscallSucceeds());
+  }
+
+  // Postcondition check.
+  {
+    socklen_t addrlen = sizeof(addr);
+    EXPECT_THAT(
+        getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+        SyscallSucceeds());
+
+    if (family == AddressFamily::kIpv4) {
+      auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
+      EXPECT_EQ(addrlen, sizeof(*addr_out));
+      EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK));
+    } else {
+      auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
+      EXPECT_EQ(addrlen, sizeof(*addr_out));
+      struct in6_addr loopback;
+      if (family == AddressFamily::kIpv6) {
+        loopback = IN6ADDR_LOOPBACK_INIT;
+      } else {
+        TestAddress const& v4_mapped_loopback = V4MappedLoopback();
+        loopback = reinterpret_cast<const struct sockaddr_in6*>(
+                       &v4_mapped_loopback.addr)
+                       ->sin6_addr;
+      }
+
+      EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
+    }
+
+    addrlen = sizeof(addr);
+    if (port == 0) {
+      EXPECT_THAT(
+          getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+          SyscallFailsWithErrno(ENOTCONN));
+    } else {
+      EXPECT_THAT(
+          getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+          SyscallSucceeds());
+    }
+  }
+}
+
+TEST_P(UdpSocketTest, ConnectAny) { ConnectAny(GetParam(), s_, 0); }
+
+TEST_P(UdpSocketTest, ConnectAnyWithPort) {
+  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
+  ConnectAny(GetParam(), s_, port);
+}
+
+// Connects |sockfd| to the ANY address with |port| stored into the sockaddr as
+// is, disconnects via AF_UNSPEC, then checks the socket is bound to ANY.
+void DisconnectAfterConnectAny(AddressFamily family, int sockfd, int port) {
+  struct sockaddr_storage addr = {};
+  socklen_t addrlen = sizeof(addr);
+  struct sockaddr_storage baddr = {};
+  if (family == AddressFamily::kIpv4) {
+    auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
+    addrlen = sizeof(*addr_in);
+    addr_in->sin_family = AF_INET;
+    addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
+    addr_in->sin_port = port;
+  } else {
+    auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
+    addrlen = sizeof(*addr_in);
+    addr_in->sin6_family = AF_INET6;
+    addr_in->sin6_port = port;
+    if (family == AddressFamily::kIpv6) {
+      addr_in->sin6_addr = IN6ADDR_ANY_INIT;
+    } else {
+      TestAddress const& v4_mapped_any = V4MappedAny();
+      addr_in->sin6_addr =
+          reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
+              ->sin6_addr;
+    }
+  }
+
+  // TODO(b/138658473): gVisor doesn't allow connecting to the zero port.
+  if (port == 0) {
+    SKIP_IF(IsRunningOnGvisor());
+  }
+
+  ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen),
+              SyscallSucceeds());
+  // Now the socket is bound to the loopback address.
+
+  // Disconnect
+  addrlen = sizeof(addr);
+  addr.ss_family = AF_UNSPEC;
+  ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&addr), addrlen),
+              SyscallSucceeds());
+
+  // Check that after disconnect the socket is bound to the ANY address.
+  EXPECT_THAT(getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+  if (family == AddressFamily::kIpv4) {
+    auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
+    EXPECT_EQ(addrlen, sizeof(*addr_out));
+    EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
+  } else {
+    auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
+    EXPECT_EQ(addrlen, sizeof(*addr_out));
+    struct in6_addr any = IN6ADDR_ANY_INIT;
+    EXPECT_EQ(memcmp(&addr_out->sin6_addr, &any, sizeof(in6_addr)), 0);
+  }
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterConnectAny) {
+  DisconnectAfterConnectAny(GetParam(), s_, 0);
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterConnectAnyWithPort) {
+  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
+  DisconnectAfterConnectAny(GetParam(), s_, port);
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterBind) {
+  ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
+  // Connect the socket.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  struct sockaddr_storage addr = {};
+  addr.ss_family = AF_UNSPEC;
+  EXPECT_THAT(
+      connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
+      SyscallSucceeds());
+
+  // Check that we're still bound.
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0);
+
+  addrlen = sizeof(addr);
+  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterBindToAny) {
+  struct sockaddr_storage baddr = {};
+  socklen_t addrlen;
+  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
+  if (GetParam() == AddressFamily::kIpv4) {
+    auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
+    addr_in->sin_family = AF_INET;
+    addr_in->sin_port = port;
+    addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
+  } else {
+    auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
+    addr_in->sin6_family = AF_INET6;
+    addr_in->sin6_port = port;
+    addr_in->sin6_scope_id = 0;
+    addr_in->sin6_addr = IN6ADDR_ANY_INIT;
+  }
+  ASSERT_THAT(bind(s_, reinterpret_cast<sockaddr*>(&baddr), addrlen_),
+              SyscallSucceeds());
+  // Connect the socket.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  struct sockaddr_storage addr = {};
+  addr.ss_family = AF_UNSPEC;
+  EXPECT_THAT(
+      connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
+      SyscallSucceeds());
+
+  // Check that we're still bound.
+  addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, &baddr, addrlen), 0);
+
+  addrlen = sizeof(addr);
+  EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, Disconnect) {
+  for (int i = 0; i < 2; i++) {
+    // Connect; the second iteration reconnects after the disconnect below.
+    EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
+
+    // Check that we're connected to the right peer.
+    struct sockaddr_storage peer;
+    socklen_t peerlen = sizeof(peer);
+    EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
+                SyscallSucceeds());
+    EXPECT_EQ(peerlen, addrlen_);
+    EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
+
+    // Disconnect by connecting to an AF_UNSPEC address.
+    struct sockaddr_storage addr = {};
+    addr.ss_family = AF_UNSPEC;
+    EXPECT_THAT(
+        connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
+        SyscallSucceeds());
+
+    peerlen = sizeof(peer);
+    EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
+                SyscallFailsWithErrno(ENOTCONN));
+
+    // getsockname() must still succeed, but is expected to report port 0.
+    socklen_t addrlen = sizeof(addr);
+    EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+                SyscallSucceeds());
+    EXPECT_EQ(addrlen, addrlen_);
+    EXPECT_EQ(*Port(&addr), 0);
+  }
+}
+
+TEST_P(UdpSocketTest, ConnectBadAddress) {
+  struct sockaddr addr = {};
+  addr.sa_family = addr_[0]->sa_family;
+  ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) {
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Send to a different destination than we're connected to.
+  char buf[512];
+  EXPECT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[1], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+}
+
+TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
+  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Bind t_ to loopback:TestPort+1.
+  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+  char buf[3];
+  // Send zero length packet from s_ to t_.
+  ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
+  // Receive the packet.
+  char received[3];
+  EXPECT_THAT(read(t_, received, sizeof(received)),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
+  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Bind t_ to loopback:TestPort+1.
+  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Set t_ to non-blocking.
+  int opts = 0;
+  ASSERT_THAT(opts = fcntl(t_, F_GETFL), SyscallSucceeds());
+  ASSERT_THAT(fcntl(t_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds());
+
+  char buf[3];
+  // Send zero length packet from s_ to t_.
+  ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
+  // Receive the packet.
+  char received[3];
+  EXPECT_THAT(read(t_, received, sizeof(received)),
+              SyscallSucceedsWithValue(0));
+  EXPECT_THAT(read(t_, received, sizeof(received)),
+              SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(UdpSocketTest, SendAndReceiveNotConnected) {
+  // Bind s_ to loopback.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Send some data to s_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Receive the data.
+  char received[sizeof(buf)];
+  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, SendAndReceiveConnected) {
+  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Bind t_ to loopback:TestPort+1.
+  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Send some data from t_ to s_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Receive the data.
+  char received[sizeof(buf)];
+  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveFromNotConnected) {
+  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Bind t_ to loopback:TestPort+2.
+  ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
+
+  // Send some data from t_ to s_.
+  char buf[512];
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Check that the data isn't received because it was sent from a different
+  // address than the one s_ is connected to.
+  EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReceiveBeforeConnect) {
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Bind t_ to loopback:TestPort+2.
+  ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
+
+  // Send some data from t_ to s_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Connect to loopback:TestPort+1.
+  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Receive the data. It works because it was sent before the connect.
+  char received[sizeof(buf)];
+  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+
+  // Send again. This time it should not be received.
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReceiveFrom) {
+  // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Bind t_ to loopback:TestPort+1.
+  ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Send some data from t_ to s_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Receive the data and sender address.
+  char received[sizeof(buf)];
+  struct sockaddr_storage addr;
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(recvfrom(s_, received, sizeof(received), 0,
+                       reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceedsWithValue(sizeof(received)));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+  EXPECT_EQ(addrlen, addrlen_);
+  EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, Listen) {
+  ASSERT_THAT(listen(s_, SOMAXCONN), SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+TEST_P(UdpSocketTest, Accept) {
+  ASSERT_THAT(accept(s_, nullptr, nullptr), SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+// This test validates that a read shutdown with pending data allows the read
+// to proceed with the data before returning EAGAIN.
+TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) {
+  char received[512];
+
+  // Bind t_ to loopback:TestPort+2, and connect to loopback:TestPort+1.
+  ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Bind s_ to loopback:TestPort+1, and connect to loopback:TestPort+2.
+  ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
+
+  // Verify that we get EWOULDBLOCK when there is nothing to read.
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  const char* buf = "abc";
+  EXPECT_THAT(write(t_, buf, 3), SyscallSucceedsWithValue(3));
+
+  int opts = 0;
+  ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
+  ASSERT_THAT(fcntl(s_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds());
+  ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
+  ASSERT_NE(opts & O_NONBLOCK, 0);
+
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+  // We should get the data even though read has been shutdown.
+  EXPECT_THAT(recv(s_, received, 2, 0), SyscallSucceedsWithValue(2));
+
+  // Because we read less than the entire packet length, since it's a packet
+  // based socket any subsequent reads should return EWOULDBLOCK.
+  EXPECT_THAT(recv(s_, received, 1, 0), SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+// This test is validating that even after a socket is shutdown if it's
+// reconnected it will reset the shutdown state.
+TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) {
+  char received[512];
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
+
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Bind and connect the socket; a read must still report EWOULDBLOCK.
+  ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
+
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReadShutdown) {
+  char received[512];
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+  // shutdown() is expected to fail until the socket has been connected.
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
+
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Connect the socket, then try to shutdown again.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+  // After SHUT_RD, a recv without MSG_DONTWAIT is expected to return 0.
+  EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, ReadShutdownDifferentThread) {
+  char received[512];
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Connect the socket, then shutdown from another thread.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+  // The helper sleeps first, presumably so the recv() below is already waiting.
+  ScopedThread t([&] {
+    absl::SleepFor(absl::Milliseconds(200));
+    EXPECT_THAT(shutdown(this->s_, SHUT_RD), SyscallSucceeds());
+  });
+  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(0));
+  t.Join();
+  // With the shutdown done and the thread joined, recv must again return 0.
+  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, WriteShutdown) {  // SHUT_WR works only once connected.
+  EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+  EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceeds());
+}
+
+TEST_P(UdpSocketTest, SynchronousReceive) {
+  // Bind s_ to loopback.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Random payload that the helper thread below sends to s_.
+  char buf[512];
+  RandomizeBuffer(buf, sizeof(buf));
+
+  // Nothing has been sent yet, so a non-blocking receive finds no data.
+  char received[512];
+  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), MSG_DONTWAIT),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Start the sender; it sleeps 200ms, then sends buf from t_ to addr_[0].
+  ScopedThread t([&] {
+    absl::SleepFor(absl::Milliseconds(200));
+    ASSERT_THAT(
+        sendto(this->t_, buf, sizeof(buf), 0, this->addr_[0], this->addrlen_),
+        SyscallSucceedsWithValue(sizeof(buf)));
+  });
+  // This recv has no MSG_DONTWAIT; it must deliver the whole 512-byte payload.
+  EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
+              SyscallSucceedsWithValue(512));
+  EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_SendRecv) {
+  // s_ receives on addr_[0].
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Three equally sized datagrams are cut from one random buffer.
+  constexpr int kLen = 100;
+  char payload[3 * kLen];
+  RandomizeBuffer(payload, sizeof(payload));
+
+  for (int pkt = 0; pkt != 3; ++pkt) {
+    ASSERT_THAT(sendto(t_, &payload[pkt * kLen], kLen, 0, addr_[0], addrlen_),
+                SyscallSucceedsWithValue(kLen));
+  }
+
+  // Each recv offers room for all three datagrams yet must return only one.
+  char sink[6 * kLen];
+  for (int pkt = 0; pkt != 3; ++pkt) {
+    EXPECT_THAT(recv(s_, &sink[pkt * kLen], 3 * kLen, 0),
+                SyscallSucceedsWithValue(kLen));
+  }
+  EXPECT_EQ(memcmp(payload, sink, 3 * kLen), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_WritevReadv) {
+  // s_ receives on addr_[0].
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Connect t_ so that a plain writev() on it is addressed to s_.
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Two datagrams go out, each gathered from two non-adjacent pieces of one
+  // random buffer.
+  constexpr size_t kPieceSize = 100;
+  char payload[4 * kPieceSize];
+  RandomizeBuffer(payload, sizeof(payload));
+
+  for (int pkt = 0; pkt < 2; pkt++) {
+    struct iovec out[2];
+    for (int part = 0; part < 2; part++) {
+      // Part `part` of datagram `pkt` is piece (pkt + 2 * part) of the buffer.
+      out[part].iov_base = payload + (pkt + 2 * part) * kPieceSize;
+      out[part].iov_len = kPieceSize;
+    }
+    ASSERT_THAT(writev(t_, out, 2), SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+
+  // Scatter each datagram over three pieces; only two of them get filled.
+  char sink[6 * kPieceSize];
+  for (int pkt = 0; pkt < 2; pkt++) {
+    struct iovec in[3];
+    for (int part = 0; part < 3; part++) {
+      // Same piece layout as on the send side.
+      in[part].iov_base = sink + (pkt + 2 * part) * kPieceSize;
+      in[part].iov_len = kPieceSize;
+    }
+    ASSERT_THAT(readv(s_, in, 3), SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+  EXPECT_EQ(memcmp(payload, sink, 4 * kPieceSize), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_SendMsgRecvMsg) {
+  // s_ receives on addr_[0].
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Two datagrams go out, each gathered from two non-adjacent pieces of one
+  // random buffer.
+  constexpr size_t kPieceSize = 100;
+  char payload[4 * kPieceSize];
+  RandomizeBuffer(payload, sizeof(payload));
+
+  for (int pkt = 0; pkt < 2; pkt++) {
+    struct iovec out[2];
+    for (int part = 0; part < 2; part++) {
+      // Part `part` of datagram `pkt` is piece (pkt + 2 * part) of the buffer.
+      out[part].iov_base = payload + (pkt + 2 * part) * kPieceSize;
+      out[part].iov_len = kPieceSize;
+    }
+    struct msghdr hdr = {};
+    hdr.msg_name = addr_[0];
+    hdr.msg_namelen = addrlen_;
+    hdr.msg_iov = out;
+    hdr.msg_iovlen = 2;
+    ASSERT_THAT(sendmsg(t_, &hdr, 0), SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+
+  // Scatter each datagram over three pieces; only two of them get filled.
+  char sink[6 * kPieceSize];
+  for (int pkt = 0; pkt < 2; pkt++) {
+    struct iovec in[3];
+    for (int part = 0; part < 3; part++) {
+      // Same piece layout as on the send side.
+      in[part].iov_base = sink + (pkt + 2 * part) * kPieceSize;
+      in[part].iov_len = kPieceSize;
+    }
+    struct msghdr hdr = {};
+    hdr.msg_iov = in;
+    hdr.msg_iovlen = 3;
+    ASSERT_THAT(recvmsg(s_, &hdr, 0), SyscallSucceedsWithValue(2 * kPieceSize));
+  }
+  EXPECT_EQ(memcmp(payload, sink, 4 * kPieceSize), 0);
+}
+
+TEST_P(UdpSocketTest, FIONREADShutdown) {
+  int n = -1;  // Sentinel; each FIONREAD call below must overwrite it.
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // A UDP socket must be connected before it can be shutdown.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+  // Connecting alone sends nothing, so FIONREAD must still report 0.
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+  // FIONREAD must keep succeeding and reporting 0 after the read shutdown.
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+}
+
+TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
+  int n = -1;  // Sentinel; each FIONREAD call below must overwrite it.
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // A UDP socket must be connected before it can be shutdown.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+  // s_ is bound and connected to addr_[0], so this datagram is queued on s_.
+  const char str[] = "abc";
+  ASSERT_THAT(send(s_, str, sizeof(str), 0),
+              SyscallSucceedsWithValue(sizeof(str)));
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, static_cast<int>(sizeof(str)));  // Compare as int, like n.
+  // NOTE(review): the test name says WriteShutdown, yet SHUT_RD is used here.
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, static_cast<int>(sizeof(str)));  // Datagram is still counted.
+}
+
+TEST_P(UdpSocketTest, Fionread) {
+  // s_ receives on addr_[0].
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // With nothing queued yet, FIONREAD must succeed and report zero bytes
+  // (replacing the -1 sentinel).
+  int pending = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &pending), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(pending, 0);
+
+  // Queue three datagrams of kLen bytes each on s_, sent from t_.
+  constexpr int kLen = 100;
+  char payload[3 * kLen];
+  RandomizeBuffer(payload, sizeof(payload));
+
+  for (int pkt = 0; pkt != 3; ++pkt) {
+    ASSERT_THAT(sendto(t_, &payload[pkt * kLen], kLen, 0, addr_[0], addrlen_),
+                SyscallSucceedsWithValue(kLen));
+
+    // However many datagrams are waiting, FIONREAD is expected to report the
+    // length of just one of them.
+    pending = -1;
+    EXPECT_THAT(ioctl(s_, FIONREAD, &pending), SyscallSucceedsWithValue(0));
+    EXPECT_EQ(pending, kLen);
+  }
+}
+
+TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) {
+  // s_ receives on addr_[0].
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // With nothing queued yet, FIONREAD must succeed and report zero bytes
+  // (replacing the -1 sentinel).
+  int pending = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &pending), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(pending, 0);
+
+  // Send three zero-length datagrams from t_ to s_; the buffer only supplies
+  // distinct source pointers.
+  constexpr int kStride = 100;
+  char payload[3 * kStride];
+  RandomizeBuffer(payload, sizeof(payload));
+  for (int pkt = 0; pkt != 3; ++pkt) {
+    ASSERT_THAT(sendto(t_, &payload[pkt * kStride], 0, 0, addr_[0], addrlen_),
+                SyscallSucceedsWithValue(0));
+
+    // Queued datagrams that carry no payload must leave FIONREAD at zero.
+    // The sentinel is reset first so that a skipped write would be noticed.
+    pending = -1;
+    EXPECT_THAT(ioctl(s_, FIONREAD, &pending), SyscallSucceedsWithValue(0));
+    EXPECT_EQ(pending, 0);
+  }
+}
+
+TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) {
+  int n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+
+  // Bind s_ to loopback:TestPort.
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // A UDP socket must be connected before it can be shutdown.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+  // s_ is bound and connected to addr_[0]; it sends itself an empty datagram.
+  const char str[] = "abc";
+  ASSERT_THAT(send(s_, str, 0, 0), SyscallSucceedsWithValue(0));
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+  // NOTE(review): the test name says WriteShutdown, yet SHUT_RD is used here.
+  EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+  n = -1;
+  EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+  EXPECT_EQ(n, 0);
+}
+
+TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
+  int enabled = -1;  // Sentinel that getsockopt() has to overwrite.
+  socklen_t len = sizeof(enabled);
+  ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &enabled, &len),
+              SyscallSucceeds());
+  ASSERT_EQ(enabled, kSockOptOff);
+  ASSERT_EQ(len, sizeof(enabled));  // The option is reported as a full int.
+}
+
+TEST_P(UdpSocketTest, SoTimestamp) {
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Ask for a receive-timestamp control message on s_.
+  int on = 1;
+  ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on)),
+              SyscallSucceeds());
+
+  // Send a zero-length datagram from t_ to s_ (the write length is 0).
+  char unused[3];
+  ASSERT_THAT(RetryEINTR(write)(t_, unused, 0), SyscallSucceedsWithValue(0));
+
+  // Receive it with room for exactly one timeval-sized control message.
+  char control[CMSG_SPACE(sizeof(struct timeval))];
+  iovec empty_iov = {};
+  msghdr hdr = {};
+  hdr.msg_iov = &empty_iov;
+  hdr.msg_iovlen = 1;
+  hdr.msg_control = control;
+  hdr.msg_controllen = sizeof(control);
+  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &hdr, 0), SyscallSucceedsWithValue(0));
+
+  // The first control message must be the SO_TIMESTAMP one.
+  struct cmsghdr* ts_cmsg = CMSG_FIRSTHDR(&hdr);
+  ASSERT_NE(ts_cmsg, nullptr);
+  ASSERT_EQ(ts_cmsg->cmsg_level, SOL_SOCKET);
+  ASSERT_EQ(ts_cmsg->cmsg_type, SO_TIMESTAMP);
+  ASSERT_EQ(ts_cmsg->cmsg_len, CMSG_LEN(sizeof(struct timeval)));
+
+  // Copy the payload out and require a non-zero timestamp.
+  struct timeval stamp = {};
+  memcpy(&stamp, CMSG_DATA(ts_cmsg), sizeof(struct timeval));
+  ASSERT_TRUE(stamp.tv_sec != 0 || stamp.tv_usec != 0);
+
+  // There should be nothing to get via ioctl.
+  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &stamp), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_P(UdpSocketTest, WriteShutdownNotConnected) {  // No connect() first.
+  EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, TimestampIoctl) {
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  char buf[3] = {};  // Zero-filled so no indeterminate bytes are written.
+  // Send packet from t_ to s_.
+  ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  // There should be no control messages.
+  char recv_buf[sizeof(buf)];
+  ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
+
+  // A nonzero timeval should be available via ioctl.
+  struct timeval tv = {};
+  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds());
+  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+}
+
+TEST_P(UdpSocketTest, TimetstampIoctlNothingRead) {
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+  // No packet is sent or read here, so SIOCGSTAMP is expected to give ENOENT.
+  struct timeval tv = {};
+  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT));
+}
+
+// Test that the timestamp accessed via SIOCGSTAMP is still accessible after
+// SO_TIMESTAMP is enabled and used to retrieve a timestamp.
+TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  char buf[3] = {};  // Zero-filled so no indeterminate bytes are written.
+  // Send a 3-byte packet, then a zero-length one, from t_ to s_.
+  ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+  ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
+
+  // There should be no control messages.
+  char recv_buf[sizeof(buf)];
+  ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
+
+  // A nonzero timeval should be available via ioctl.
+  struct timeval tv = {};
+  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds());
+  ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+
+  // Enable SO_TIMESTAMP and send a message.
+  int v = 1;
+  EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
+              SyscallSucceeds());
+  ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
+
+  // There should be a message for SO_TIMESTAMP.
+  char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
+  msghdr msg = {};
+  iovec iov = {};
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+  msg.msg_control = cmsgbuf;
+  msg.msg_controllen = sizeof(cmsgbuf);
+  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+  // A null header would mean that no control message was delivered.
+  ASSERT_NE(cmsg, nullptr);
+
+  // The ioctl should return the exact same values as before.
+  struct timeval tv2 = {};
+  ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv2), SyscallSucceeds());
+  ASSERT_EQ(tv.tv_sec, tv2.tv_sec);
+  ASSERT_EQ(tv.tv_usec, tv2.tv_usec);
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.h b/test/syscalls/linux/udp_socket_test_cases.h
new file mode 100644
index 000000000..2fd79d99e
--- /dev/null
+++ b/test/syscalls/linux/udp_socket_test_cases.h
@@ -0,0 +1,74 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_UDP_SOCKET_TEST_CASES_H_
+#define THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_UDP_SOCKET_TEST_CASES_H_
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// The initial port to be used on gvisor.
+constexpr int TestPort = 40000;
+
+// Fixture for tests parameterized by the address family to use (AF_INET and
+// AF_INET6) when creating sockets.
+class UdpSocketTest
+    : public ::testing::TestWithParam<gvisor::testing::AddressFamily> {
+ protected:
+  // Creates two sockets that will be used by test cases.
+  void SetUp() override;
+
+  // Closes the sockets created by SetUp() and frees every port in ports_.
+  void TearDown() override {
+    EXPECT_THAT(close(s_), SyscallSucceeds());
+    EXPECT_THAT(close(t_), SyscallSucceeds());
+    // A failed release aborts TearDown and leaves the later ports unreleased.
+    for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
+      ASSERT_NO_ERRNO(FreeAvailablePort(ports_[i]));
+    }
+  }
+
+  // First UDP socket; -1 (never a valid descriptor) until SetUp() sets it.
+  int s_ = -1;
+
+  // Second UDP socket; -1 (never a valid descriptor) until SetUp() sets it.
+  int t_ = -1;
+
+  // The length of the socket address.
+  socklen_t addrlen_;
+
+  // Initialized address pointing to loopback and port TestPort+i.
+  struct sockaddr* addr_[3];
+
+  // Initialize "any" address.
+  struct sockaddr* anyaddr_;
+
+  // Used ports.
+  int ports_[3];
+
+ private:
+  // Storage for the loopback addresses.
+  struct sockaddr_storage addr_storage_[3];
+
+  // Storage for the "any" address.
+  struct sockaddr_storage anyaddr_storage_;
+};
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_UDP_SOCKET_TEST_CASES_H_
-- 
cgit v1.2.3


From 1518f7fd38cc2367ee966443a5895a3f25621d83 Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Mon, 2 Dec 2019 08:32:27 -0800
Subject: Fix typo, s/Convertable/Convertible/g

PiperOrigin-RevId: 283345791
---
 test/syscalls/linux/socket_ipv4_udp_unbound.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index 6b1af6c17..aa6fb4e3f 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -1814,7 +1814,7 @@ TEST_P(IPv4UDPUnboundSocketTest, BindReusePortThenReuseAddr) {
               SyscallFailsWithErrno(EADDRINUSE));
 }
 
-TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReusePort) {
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertibleToReusePort) {
   auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
@@ -1855,7 +1855,7 @@ TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReusePort) {
               SyscallFailsWithErrno(EADDRINUSE));
 }
 
-TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReuseAddr) {
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertibleToReuseAddr) {
   // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
   SKIP_IF(IsRunningOnGvisor());
 
-- 
cgit v1.2.3


From b41277049c6c6c15581d8698fd9418ef9c2cec8a Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 2 Dec 2019 15:35:51 -0800
Subject: test/syscal: Don't skip ClockGettime.CputimeId

We skipped it due to the issue in the golang scheduler
which has been fixed in go1.13.

PiperOrigin-RevId: 283432226
---
 test/syscalls/linux/clock_gettime.cc | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/clock_gettime.cc b/test/syscalls/linux/clock_gettime.cc
index 2aa91691e..7f6015049 100644
--- a/test/syscalls/linux/clock_gettime.cc
+++ b/test/syscalls/linux/clock_gettime.cc
@@ -56,11 +56,6 @@ void spin_ns(int64_t ns) {
 
 // Test that CLOCK_PROCESS_CPUTIME_ID is a superset of CLOCK_THREAD_CPUTIME_ID.
 TEST(ClockGettime, CputimeId) {
-  // TODO(b/128871825,golang.org/issue/10958): Test times out when there is a
-  // small number of core because one goroutine starves the others.
-  printf("CPUS: %d\n", std::thread::hardware_concurrency());
-  SKIP_IF(std::thread::hardware_concurrency() <= 2);
-
   constexpr int kNumThreads = 13;  // arbitrary
 
   absl::Duration spin_time = absl::Seconds(1);
-- 
cgit v1.2.3


From d7cc2480cb6e465ce01eb245e7edbad2c68c44d8 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 3 Dec 2019 12:45:43 -0800
Subject: Add RunfilesPath to test_util

A few tests have their own ad-hoc implementations. Add a single common one.

PiperOrigin-RevId: 283601666
---
 test/syscalls/linux/exec.cc        | 145 +++++++++++++++++--------------------
 test/syscalls/linux/sigaltstack.cc |   8 +-
 test/util/BUILD                    |   2 +
 test/util/test_util.h              |   6 ++
 test/util/test_util_runfiles.cc    |  46 ++++++++++++
 5 files changed, 123 insertions(+), 84 deletions(-)
 create mode 100644 test/util/test_util_runfiles.cc

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 581f03533..b5e0a512b 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -47,23 +47,14 @@ namespace testing {
 
 namespace {
 
-constexpr char kBasicWorkload[] = "exec_basic_workload";
-constexpr char kExitScript[] = "exit_script";
-constexpr char kStateWorkload[] = "exec_state_workload";
-constexpr char kProcExeWorkload[] = "exec_proc_exe_workload";
-constexpr char kAssertClosedWorkload[] = "exec_assert_closed_workload";
-constexpr char kPriorityWorkload[] = "priority_execve";
-
-std::string WorkloadPath(absl::string_view binary) {
-  std::string full_path;
-  char* test_src = getenv("TEST_SRCDIR");
-  if (test_src) {
-    full_path = JoinPath(test_src, "__main__/test/syscalls/linux", binary);
-  }
-
-  TEST_CHECK(full_path.empty() == false);
-  return full_path;
-}
+constexpr char kBasicWorkload[] = "test/syscalls/linux/exec_basic_workload";
+constexpr char kExitScript[] = "test/syscalls/linux/exit_script";
+constexpr char kStateWorkload[] = "test/syscalls/linux/exec_state_workload";
+constexpr char kProcExeWorkload[] =
+    "test/syscalls/linux/exec_proc_exe_workload";
+constexpr char kAssertClosedWorkload[] =
+    "test/syscalls/linux/exec_assert_closed_workload";
+constexpr char kPriorityWorkload[] = "test/syscalls/linux/priority_execve";
 
 constexpr char kExit42[] = "--exec_exit_42";
 constexpr char kExecWithThread[] = "--exec_exec_with_thread";
@@ -171,44 +162,44 @@ TEST(ExecTest, EmptyPath) {
 }
 
 TEST(ExecTest, Basic) {
-  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, {},
+  CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)}, {},
             ArgEnvExitStatus(0, 0),
-            absl::StrCat(WorkloadPath(kBasicWorkload), "\n"));
+            absl::StrCat(RunfilePath(kBasicWorkload), "\n"));
 }
 
 TEST(ExecTest, OneArg) {
-  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload), "1"},
-            {}, ArgEnvExitStatus(1, 0),
-            absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n"));
+  CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload), "1"}, {},
+            ArgEnvExitStatus(1, 0),
+            absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n"));
 }
 
 TEST(ExecTest, FiveArg) {
-  CheckExec(WorkloadPath(kBasicWorkload),
-            {WorkloadPath(kBasicWorkload), "1", "2", "3", "4", "5"}, {},
+  CheckExec(RunfilePath(kBasicWorkload),
+            {RunfilePath(kBasicWorkload), "1", "2", "3", "4", "5"}, {},
             ArgEnvExitStatus(5, 0),
-            absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
+            absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
 }
 
 TEST(ExecTest, OneEnv) {
-  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, {"1"},
+  CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)}, {"1"},
             ArgEnvExitStatus(0, 1),
-            absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n"));
+            absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n"));
 }
 
 TEST(ExecTest, FiveEnv) {
-  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)},
+  CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)},
             {"1", "2", "3", "4", "5"}, ArgEnvExitStatus(0, 5),
-            absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
+            absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
 }
 
 TEST(ExecTest, OneArgOneEnv) {
-  CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload), "arg"},
+  CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload), "arg"},
             {"env"}, ArgEnvExitStatus(1, 1),
-            absl::StrCat(WorkloadPath(kBasicWorkload), "\narg\nenv\n"));
+            absl::StrCat(RunfilePath(kBasicWorkload), "\narg\nenv\n"));
 }
 
 TEST(ExecTest, InterpreterScript) {
-  CheckExec(WorkloadPath(kExitScript), {WorkloadPath(kExitScript), "25"}, {},
+  CheckExec(RunfilePath(kExitScript), {RunfilePath(kExitScript), "25"}, {},
             ArgEnvExitStatus(25, 0), "");
 }
 
@@ -216,7 +207,7 @@ TEST(ExecTest, InterpreterScript) {
 TEST(ExecTest, InterpreterScriptArgSplit) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " foo bar"),
@@ -230,7 +221,7 @@ TEST(ExecTest, InterpreterScriptArgSplit) {
 TEST(ExecTest, InterpreterScriptArgvZero) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
@@ -244,7 +235,7 @@ TEST(ExecTest, InterpreterScriptArgvZero) {
 TEST(ExecTest, InterpreterScriptArgvZeroRelative) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
@@ -261,7 +252,7 @@ TEST(ExecTest, InterpreterScriptArgvZeroRelative) {
 TEST(ExecTest, InterpreterScriptArgvZeroAdded) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
@@ -274,7 +265,7 @@ TEST(ExecTest, InterpreterScriptArgvZeroAdded) {
 TEST(ExecTest, InterpreterScriptArgNUL) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(),
@@ -289,7 +280,7 @@ TEST(ExecTest, InterpreterScriptArgNUL) {
 TEST(ExecTest, InterpreterScriptTrailingWhitespace) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), "  "), 0755));
@@ -302,7 +293,7 @@ TEST(ExecTest, InterpreterScriptTrailingWhitespace) {
 TEST(ExecTest, InterpreterScriptArgWhitespace) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), "  foo"), 0755));
@@ -325,7 +316,7 @@ TEST(ExecTest, InterpreterScriptNoPath) {
 TEST(ExecTest, ExecFn) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kStateWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kStateWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " PrintExecFn"),
@@ -342,7 +333,7 @@ TEST(ExecTest, ExecFn) {
 }
 
 TEST(ExecTest, ExecName) {
-  std::string path = WorkloadPath(kStateWorkload);
+  std::string path = RunfilePath(kStateWorkload);
 
   CheckExec(path, {path, "PrintExecName"}, {}, ArgEnvExitStatus(0, 0),
             absl::StrCat(Basename(path).substr(0, 15), "\n"));
@@ -351,7 +342,7 @@ TEST(ExecTest, ExecName) {
 TEST(ExecTest, ExecNameScript) {
   // Symlink through /tmp to ensure the path is short enough.
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kStateWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kStateWorkload)));
 
   TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
       GetAbsoluteTestTmpdir(),
@@ -405,13 +396,13 @@ TEST(ExecStateTest, HandlerReset) {
   ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds());
 
   ExecveArray args = {
-      WorkloadPath(kStateWorkload),
+      RunfilePath(kStateWorkload),
       "CheckSigHandler",
       absl::StrCat(SIGUSR1),
       absl::StrCat(absl::Hex(reinterpret_cast<uintptr_t>(SIG_DFL))),
   };
 
-  CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+  CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
 }
 
 // Ignored signal dispositions are not reset.
@@ -421,13 +412,13 @@ TEST(ExecStateTest, IgnorePreserved) {
   ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds());
 
   ExecveArray args = {
-      WorkloadPath(kStateWorkload),
+      RunfilePath(kStateWorkload),
       "CheckSigHandler",
       absl::StrCat(SIGUSR1),
       absl::StrCat(absl::Hex(reinterpret_cast<uintptr_t>(SIG_IGN))),
   };
 
-  CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+  CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
 }
 
 // Signal masks are not reset on exec
@@ -438,12 +429,12 @@ TEST(ExecStateTest, SignalMask) {
   ASSERT_THAT(sigprocmask(SIG_BLOCK, &s, nullptr), SyscallSucceeds());
 
   ExecveArray args = {
-      WorkloadPath(kStateWorkload),
+      RunfilePath(kStateWorkload),
       "CheckSigBlocked",
       absl::StrCat(SIGUSR1),
   };
 
-  CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+  CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
 }
 
 // itimers persist across execve.
@@ -471,7 +462,7 @@ TEST(ExecStateTest, ItimerPreserved) {
     }
   };
 
-  std::string filename = WorkloadPath(kStateWorkload);
+  std::string filename = RunfilePath(kStateWorkload);
   ExecveArray argv = {
       filename,
       "CheckItimerEnabled",
@@ -495,8 +486,8 @@ TEST(ExecStateTest, ItimerPreserved) {
 TEST(ProcSelfExe, ChangesAcrossExecve) {
   // See exec_proc_exe_workload for more details. We simply
   // assert that the /proc/self/exe link changes across execve.
-  CheckExec(WorkloadPath(kProcExeWorkload),
-            {WorkloadPath(kProcExeWorkload),
+  CheckExec(RunfilePath(kProcExeWorkload),
+            {RunfilePath(kProcExeWorkload),
              ASSERT_NO_ERRNO_AND_VALUE(ProcessExePath(getpid()))},
             {}, W_EXITCODE(0, 0), "");
 }
@@ -507,8 +498,8 @@ TEST(ExecTest, CloexecNormalFile) {
   const FileDescriptor fd_closed_on_exec =
       ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY | O_CLOEXEC));
 
-  CheckExec(WorkloadPath(kAssertClosedWorkload),
-            {WorkloadPath(kAssertClosedWorkload),
+  CheckExec(RunfilePath(kAssertClosedWorkload),
+            {RunfilePath(kAssertClosedWorkload),
              absl::StrCat(fd_closed_on_exec.get())},
             {}, W_EXITCODE(0, 0), "");
 
@@ -517,10 +508,10 @@ TEST(ExecTest, CloexecNormalFile) {
   const FileDescriptor fd_open_on_exec =
       ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY));
 
-  CheckExec(WorkloadPath(kAssertClosedWorkload),
-            {WorkloadPath(kAssertClosedWorkload),
-             absl::StrCat(fd_open_on_exec.get())},
-            {}, W_EXITCODE(2, 0), "");
+  CheckExec(
+      RunfilePath(kAssertClosedWorkload),
+      {RunfilePath(kAssertClosedWorkload), absl::StrCat(fd_open_on_exec.get())},
+      {}, W_EXITCODE(2, 0), "");
 }
 
 TEST(ExecTest, CloexecEventfd) {
@@ -528,15 +519,15 @@ TEST(ExecTest, CloexecEventfd) {
   ASSERT_THAT(efd = eventfd(0, EFD_CLOEXEC), SyscallSucceeds());
   FileDescriptor fd(efd);
 
-  CheckExec(WorkloadPath(kAssertClosedWorkload),
-            {WorkloadPath(kAssertClosedWorkload), absl::StrCat(fd.get())}, {},
+  CheckExec(RunfilePath(kAssertClosedWorkload),
+            {RunfilePath(kAssertClosedWorkload), absl::StrCat(fd.get())}, {},
             W_EXITCODE(0, 0), "");
 }
 
 constexpr int kLinuxMaxSymlinks = 40;
 
 TEST(ExecTest, SymlinkLimitExceeded) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
 
   // Hold onto TempPath objects so they are not destructed prematurely.
   std::vector<TempPath> symlinks;
@@ -575,13 +566,13 @@ TEST(ExecTest, SymlinkLimitRefreshedForInterpreter) {
 }
 
 TEST(ExecveatTest, BasicWithFDCWD) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   CheckExecveat(AT_FDCWD, path, {path}, {}, /*flags=*/0, ArgEnvExitStatus(0, 0),
                 absl::StrCat(path, "\n"));
 }
 
 TEST(ExecveatTest, Basic) {
-  std::string absolute_path = WorkloadPath(kBasicWorkload);
+  std::string absolute_path = RunfilePath(kBasicWorkload);
   std::string parent_dir = std::string(Dirname(absolute_path));
   std::string base = std::string(Basename(absolute_path));
   const FileDescriptor dirfd =
@@ -592,7 +583,7 @@ TEST(ExecveatTest, Basic) {
 }
 
 TEST(ExecveatTest, FDNotADirectory) {
-  std::string absolute_path = WorkloadPath(kBasicWorkload);
+  std::string absolute_path = RunfilePath(kBasicWorkload);
   std::string base = std::string(Basename(absolute_path));
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(absolute_path, 0));
 
@@ -604,13 +595,13 @@ TEST(ExecveatTest, FDNotADirectory) {
 }
 
 TEST(ExecveatTest, AbsolutePathWithFDCWD) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   CheckExecveat(AT_FDCWD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0,
                 absl::StrCat(path, "\n"));
 }
 
 TEST(ExecveatTest, AbsolutePath) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   // File descriptor should be ignored when an absolute path is given.
   const int32_t badFD = -1;
   CheckExecveat(badFD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0,
@@ -618,7 +609,7 @@ TEST(ExecveatTest, AbsolutePath) {
 }
 
 TEST(ExecveatTest, EmptyPathBasic) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
 
   CheckExecveat(fd.get(), "", {path}, {}, AT_EMPTY_PATH, ArgEnvExitStatus(0, 0),
@@ -626,7 +617,7 @@ TEST(ExecveatTest, EmptyPathBasic) {
 }
 
 TEST(ExecveatTest, EmptyPathWithDirFD) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   std::string parent_dir = std::string(Dirname(path));
   const FileDescriptor dirfd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
@@ -639,7 +630,7 @@ TEST(ExecveatTest, EmptyPathWithDirFD) {
 }
 
 TEST(ExecveatTest, EmptyPathWithoutEmptyPathFlag) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
 
   int execve_errno;
@@ -649,7 +640,7 @@ TEST(ExecveatTest, EmptyPathWithoutEmptyPathFlag) {
 }
 
 TEST(ExecveatTest, AbsolutePathWithEmptyPathFlag) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
 
   CheckExecveat(fd.get(), path, {path}, {}, AT_EMPTY_PATH,
@@ -657,7 +648,7 @@ TEST(ExecveatTest, AbsolutePathWithEmptyPathFlag) {
 }
 
 TEST(ExecveatTest, RelativePathWithEmptyPathFlag) {
-  std::string absolute_path = WorkloadPath(kBasicWorkload);
+  std::string absolute_path = RunfilePath(kBasicWorkload);
   std::string parent_dir = std::string(Dirname(absolute_path));
   std::string base = std::string(Basename(absolute_path));
   const FileDescriptor dirfd =
@@ -670,7 +661,7 @@ TEST(ExecveatTest, RelativePathWithEmptyPathFlag) {
 TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) {
   std::string parent_dir = "/tmp";
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo(parent_dir, WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo(parent_dir, RunfilePath(kBasicWorkload)));
   const FileDescriptor dirfd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
   std::string base = std::string(Basename(link.path()));
@@ -685,7 +676,7 @@ TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) {
 TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) {
   std::string parent_dir = "/tmp";
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo(parent_dir, WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo(parent_dir, RunfilePath(kBasicWorkload)));
   std::string path = link.path();
 
   int execve_errno;
@@ -697,7 +688,7 @@ TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) {
 
 TEST(ExecveatTest, SymlinkNoFollowAndEmptyPath) {
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+      TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
   std::string path = link.path();
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, 0));
 
@@ -723,7 +714,7 @@ TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) {
 }
 
 TEST(ExecveatTest, BasicWithCloexecFD) {
-  std::string path = WorkloadPath(kBasicWorkload);
+  std::string path = RunfilePath(kBasicWorkload);
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC));
 
   CheckExecveat(fd.get(), "", {path}, {}, AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH,
@@ -731,7 +722,7 @@ TEST(ExecveatTest, BasicWithCloexecFD) {
 }
 
 TEST(ExecveatTest, InterpreterScriptWithCloexecFD) {
-  std::string path = WorkloadPath(kExitScript);
+  std::string path = RunfilePath(kExitScript);
   const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC));
 
   int execve_errno;
@@ -742,7 +733,7 @@ TEST(ExecveatTest, InterpreterScriptWithCloexecFD) {
 }
 
 TEST(ExecveatTest, InterpreterScriptWithCloexecDirFD) {
-  std::string absolute_path = WorkloadPath(kExitScript);
+  std::string absolute_path = RunfilePath(kExitScript);
   std::string parent_dir = std::string(Dirname(absolute_path));
   std::string base = std::string(Basename(absolute_path));
   const FileDescriptor dirfd =
@@ -775,7 +766,7 @@ TEST(GetpriorityTest, ExecveMaintainsPriority) {
 
   // Program run (priority_execve) will exit(X) where
   // X=getpriority(PRIO_PROCESS,0). Check that this exit value is prio.
-  CheckExec(WorkloadPath(kPriorityWorkload), {WorkloadPath(kPriorityWorkload)},
+  CheckExec(RunfilePath(kPriorityWorkload), {RunfilePath(kPriorityWorkload)},
             {}, W_EXITCODE(expected_exit_code, 0), "");
 }
 
diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc
index 6fd3989a4..a778fa639 100644
--- a/test/syscalls/linux/sigaltstack.cc
+++ b/test/syscalls/linux/sigaltstack.cc
@@ -95,13 +95,7 @@ TEST(SigaltstackTest, ResetByExecve) {
   auto const cleanup_sigstack =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaltstack(stack));
 
-  std::string full_path;
-  char* test_src = getenv("TEST_SRCDIR");
-  if (test_src) {
-    full_path = JoinPath(test_src, "../../linux/sigaltstack_check");
-  }
-
-  ASSERT_FALSE(full_path.empty());
+  std::string full_path = RunfilePath("test/syscalls/linux/sigaltstack_check");
 
   pid_t child_pid = -1;
   int execve_errno = 0;
diff --git a/test/util/BUILD b/test/util/BUILD
index 4526bb3f1..cbc728159 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -237,6 +237,7 @@ cc_library(
     ] + select_for_linux(
         [
             "test_util_impl.cc",
+            "test_util_runfiles.cc",
         ],
     ),
     hdrs = ["test_util.h"],
@@ -245,6 +246,7 @@ cc_library(
         ":logging",
         ":posix_error",
         ":save_util",
+        "@bazel_tools//tools/cpp/runfiles",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/flags:flag",
         "@com_google_absl//absl/flags:parse",
diff --git a/test/util/test_util.h b/test/util/test_util.h
index dc30575b8..ee6c2bf4d 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -764,6 +764,12 @@ MATCHER_P2(EquivalentWithin, target, tolerance,
   return Equivalent(arg, target, tolerance);
 }
 
+// Returns the absolute path to a data dependency. 'path' is the runfile
+// location relative to the workspace root.
+#ifdef __linux__
+std::string RunfilePath(std::string path);
+#endif
+
 void TestInit(int* argc, char*** argv);
 
 }  // namespace testing
diff --git a/test/util/test_util_runfiles.cc b/test/util/test_util_runfiles.cc
new file mode 100644
index 000000000..7210094eb
--- /dev/null
+++ b/test/util/test_util_runfiles.cc
@@ -0,0 +1,46 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <string>
+
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+#include "tools/cpp/runfiles/runfiles.h"
+
+namespace gvisor {
+namespace testing {
+
+std::string RunfilePath(std::string path) {
+  static const bazel::tools::cpp::runfiles::Runfiles* const runfiles = [] {
+    std::string error;
+    auto* runfiles =
+        bazel::tools::cpp::runfiles::Runfiles::CreateForTest(&error);
+    if (runfiles == nullptr) {
+      std::cerr << "Unable to find runfiles: " << error << std::endl;
+    }
+    return runfiles;
+  }();
+
+  if (!runfiles) {
+    // Can't find runfiles? This probably won't work, but __main__/path is our
+    // best guess.
+    return JoinPath("__main__", path);
+  }
+
+  return runfiles->Rlocation(JoinPath("__main__", path));
+}
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From 43643752f05a0b25259b116558ccd870a539cc05 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 3 Dec 2019 13:46:09 -0800
Subject: strace: don't create a slice with a negative value

PiperOrigin-RevId: 283613824
---
 pkg/sentry/strace/socket.go             |  9 +++++++++
 test/syscalls/linux/socket_unix_cmsg.cc | 29 +++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index 94334f6d2..51f2efb39 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -208,6 +208,15 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
 		i += linux.SizeOfControlMessageHeader
 		width := t.Arch().Width()
 		length := int(h.Length) - linux.SizeOfControlMessageHeader
+		if length < 0 {
+			strs = append(strs, fmt.Sprintf(
+				"{level=%s, type=%s, length=%d, content too short}",
+				level,
+				typ,
+				h.Length,
+			))
+			break
+		}
 
 		if skipData {
 			strs = append(strs, fmt.Sprintf("{level=%s, type=%s, length=%d}", level, typ, h.Length))
diff --git a/test/syscalls/linux/socket_unix_cmsg.cc b/test/syscalls/linux/socket_unix_cmsg.cc
index 1159c5229..a16899493 100644
--- a/test/syscalls/linux/socket_unix_cmsg.cc
+++ b/test/syscalls/linux/socket_unix_cmsg.cc
@@ -149,6 +149,35 @@ TEST_P(UnixSocketPairCmsgTest, BadFDPass) {
               SyscallFailsWithErrno(EBADF));
 }
 
+TEST_P(UnixSocketPairCmsgTest, ShortCmsg) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  char sent_data[20];
+  RandomizeBuffer(sent_data, sizeof(sent_data));
+
+  int sent_fd = -1;
+
+  struct msghdr msg = {};
+  char control[CMSG_SPACE(sizeof(sent_fd))];
+  msg.msg_control = control;
+  msg.msg_controllen = sizeof(control);
+
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+  cmsg->cmsg_len = 1;
+  cmsg->cmsg_level = SOL_SOCKET;
+  cmsg->cmsg_type = SCM_RIGHTS;
+  memcpy(CMSG_DATA(cmsg), &sent_fd, sizeof(sent_fd));
+
+  struct iovec iov;
+  iov.iov_base = sent_data;
+  iov.iov_len = sizeof(sent_data);
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 // BasicFDPassNoSpace starts off by sending a single FD just like BasicFDPass.
 // The difference is that when calling recvmsg, no space for FDs is provided,
 // only space for the cmsg header.
-- 
cgit v1.2.3


From 27e2c4ddca553cf6867bd49f2847ef007ac560c0 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 3 Dec 2019 14:40:22 -0800
Subject: Fix panic due to early transition to Closed.

The code in rcv.consumeSegment incorrectly transitions to
CLOSED state from LAST-ACK before the final ACK for the FIN.

Further, if receiving a segment changes a socket to a closed state,
then we should not invoke the sender, as the socket is now closed
and sending any segments is incorrect.

PiperOrigin-RevId: 283625300
---
 pkg/tcpip/transport/tcp/connect.go           |  32 ++---
 pkg/tcpip/transport/tcp/rcv.go               |   2 +-
 pkg/tcpip/transport/tcp/tcp_test.go          | 179 +++++++++++++++++++++++++++
 test/syscalls/linux/BUILD                    |   1 +
 test/syscalls/linux/socket_ip_tcp_generic.cc |  23 ++++
 5 files changed, 222 insertions(+), 15 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 4206db8b6..16f8aea12 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -953,20 +953,6 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 func (e *endpoint) handleSegments() *tcpip.Error {
 	checkRequeue := true
 	for i := 0; i < maxSegmentsPerWake; i++ {
-		e.mu.RLock()
-		state := e.state
-		e.mu.RUnlock()
-		if state == StateClose {
-			// When we get into StateClose while processing from the queue,
-			// return immediately and let the protocolMainloop handle it.
-			//
-			// We can reach StateClose only while processing a previous segment
-			// or a notification from the protocolMainLoop (caller goroutine).
-			// This means that with this return, the segment dequeue below can
-			// never occur on a closed endpoint.
-			return nil
-		}
-
 		s := e.segmentQueue.dequeue()
 		if s == nil {
 			checkRequeue = false
@@ -1024,6 +1010,24 @@ func (e *endpoint) handleSegments() *tcpip.Error {
 				s.decRef()
 				continue
 			}
+
+			// Now check if the received segment has caused us to transition
+			// to a CLOSED state, if yes then terminate processing and do
+			// not invoke the sender.
+			e.mu.RLock()
+			state := e.state
+			e.mu.RUnlock()
+			if state == StateClose {
+				// When we get into StateClose while processing from the queue,
+				// return immediately and let the protocolMainloop handle it.
+				//
+				// We can reach StateClose only while processing a previous segment
+				// or a notification from the protocolMainLoop (caller goroutine).
+				// This means that with this return, the segment dequeue below can
+				// never occur on a closed endpoint.
+				s.decRef()
+				return nil
+			}
 			e.snd.handleRcvdSegment(s)
 		}
 		s.decRef()
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 857dc445f..5ee499c36 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -205,7 +205,7 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 
 	// Handle ACK (not FIN-ACK, which we handled above) during one of the
 	// shutdown states.
-	if s.flagIsSet(header.TCPFlagAck) {
+	if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == r.ep.snd.sndNxt {
 		r.ep.mu.Lock()
 		switch r.ep.state {
 		case StateFinWait1:
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 50829ae27..d1f0d6ce7 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -5632,6 +5632,7 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) {
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  iss,
+		RcvWnd:  30000,
 	})
 
 	// Receive the SYN-ACK reply.
@@ -5750,6 +5751,7 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) {
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  iss,
+		RcvWnd:  30000,
 	})
 
 	// Receive the SYN-ACK reply.
@@ -5856,6 +5858,7 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  iss,
+		RcvWnd:  30000,
 	})
 
 	// Receive the SYN-ACK reply.
@@ -5929,6 +5932,7 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  iss,
+		RcvWnd:  30000,
 	})
 
 	c.CheckNoPacketTimeout("unexpected packet received in response to SYN", 1*time.Second)
@@ -5941,6 +5945,7 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  iss,
+		RcvWnd:  30000,
 	})
 
 	// Receive the SYN-ACK reply.
@@ -6007,6 +6012,7 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 		DstPort: context.StackPort,
 		Flags:   header.TCPFlagSyn,
 		SeqNum:  iss,
+		RcvWnd:  30000,
 	})
 
 	// Receive the SYN-ACK reply.
@@ -6115,3 +6121,176 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 		checker.AckNum(uint32(ackHeaders.SeqNum)),
 		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
 }
+
+func TestTCPCloseWithData(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed
+	// after 5 seconds in TIME_WAIT state.
+	tcpTimeWaitTimeout := 5 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
+		t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err)
+	}
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+		RcvWnd:  30000,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+			if err != nil {
+				t.Fatalf("Accept failed: %s", err)
+			}
+
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+
+	// Now trigger a passive close by sending a FIN.
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+		RcvWnd:  30000,
+	}
+
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Now write a few bytes and then close the endpoint.
+	data := []byte{1, 2, 3}
+	view := buffer.NewView(len(data))
+	copy(view, data)
+
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Check that data is received.
+	b = c.GetPacket()
+	checker.IPv4(t, b,
+		checker.PayloadLen(len(data)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(iss)+2), // Acknum is initial sequence number + 2 (peer's SYN and FIN each consume one).
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	if p := b[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(data, p) {
+		t.Errorf("got data = %x, want = %x", p, data)
+	}
+
+	c.EP.Close()
+	// Check the FIN.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)+uint32(len(data))),
+		checker.AckNum(uint32(iss+2)),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	// First send a partial ACK.
+	ackHeaders = &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 2,
+		AckNum:  c.IRS + 1 + seqnum.Value(len(data)-1),
+		RcvWnd:  30000,
+	}
+	c.SendPacket(nil, ackHeaders)
+
+	// Now send a full ACK.
+	ackHeaders = &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 2,
+		AckNum:  c.IRS + 1 + seqnum.Value(len(data)),
+		RcvWnd:  30000,
+	}
+	c.SendPacket(nil, ackHeaders)
+
+	// Now ACK the FIN.
+	ackHeaders.AckNum++
+	c.SendPacket(nil, ackHeaders)
+
+	// Now send an ACK and we should get a RST back as the endpoint should
+	// be in CLOSED state.
+	ackHeaders = &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 2,
+		AckNum:  c.IRS + 1 + seqnum.Value(len(data)),
+		RcvWnd:  30000,
+	}
+	c.SendPacket(nil, ackHeaders)
+
+	// Check the RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(ackHeaders.AckNum)),
+		checker.AckNum(uint32(ackHeaders.SeqNum)),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+
+}
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 2dd115409..a865e8857 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2142,6 +2142,7 @@ cc_library(
         ":socket_test_util",
         "//test/util:test_util",
         "//test/util:thread_util",
+        "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index a37b49447..c74273436 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -24,6 +24,8 @@
 #include <sys/un.h>
 
 #include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
@@ -789,5 +791,26 @@ TEST_P(TCPSocketPairTest, SetTCPLingerTimeout) {
   EXPECT_EQ(get, kTCPLingerTimeout);
 }
 
+TEST_P(TCPSocketPairTest, TestTCPCloseWithData) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  ScopedThread t([&]() {
+    // Close one end to trigger sending of a FIN.
+    ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_WR), SyscallSucceeds());
+    char buf[3];
+    ASSERT_THAT(read(sockets->second_fd(), buf, 3),
+                SyscallSucceedsWithValue(3));
+    absl::SleepFor(absl::Milliseconds(50));
+    ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+  });
+
+  absl::SleepFor(absl::Milliseconds(50));
+  // Send some data then close.
+  constexpr char kStr[] = "abc";
+  ASSERT_THAT(write(sockets->first_fd(), kStr, 3), SyscallSucceedsWithValue(3));
+  t.Join();
+  ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+}
+
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From cf7f27c16793eaa41743e96488dad2ddfd1f5d59 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 3 Dec 2019 16:30:38 -0800
Subject: net/udp: return a local route address as the bound-to address

If the socket is bound to ANY and connected to a loopback address,
getsockname() has to return the loopback address. Without this fix,
getsockname() returns ANY.

PiperOrigin-RevId: 283647781
---
 pkg/tcpip/transport/udp/endpoint.go          |  7 ++++-
 test/syscalls/linux/udp_socket_test_cases.cc | 39 ++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 24cb88c13..4b161e404 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -1134,9 +1134,14 @@ func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 
+	addr := e.ID.LocalAddress
+	if e.state == StateConnected {
+		addr = e.route.LocalAddress
+	}
+
 	return tcpip.FullAddress{
 		NIC:  e.RegisterNICID,
-		Addr: e.ID.LocalAddress,
+		Addr: addr,
 		Port: e.ID.LocalPort,
 	}, nil
 }
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index b6090ac66..63b92d6a7 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -527,6 +527,45 @@ TEST_P(UdpSocketTest, DisconnectAfterBind) {
               SyscallFailsWithErrno(ENOTCONN));
 }
 
+TEST_P(UdpSocketTest, BindToAnyConnnectToLocalhost) {
+  struct sockaddr_storage baddr = {};
+  auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
+  if (GetParam() == AddressFamily::kIpv4) {
+    auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
+    addr_in->sin_family = AF_INET;
+    addr_in->sin_port = port;
+    addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
+  } else {
+    auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
+    addr_in->sin6_family = AF_INET6;
+    addr_in->sin6_port = port;
+    addr_in->sin6_scope_id = 0;
+    addr_in->sin6_addr = IN6ADDR_ANY_INIT;
+  }
+  ASSERT_THAT(bind(s_, reinterpret_cast<sockaddr*>(&baddr), addrlen_),
+              SyscallSucceeds());
+  // Connect the socket.
+  ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  struct sockaddr_storage addr = {};
+  socklen_t addrlen = sizeof(addr);
+  EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  // If the socket is bound to ANY and connected to a loopback address,
+  // getsockname() has to return the loopback address.
+  if (GetParam() == AddressFamily::kIpv4) {
+    auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
+    EXPECT_EQ(addrlen, sizeof(*addr_out));
+    EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK));
+  } else {
+    auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
+    struct in6_addr loopback = IN6ADDR_LOOPBACK_INIT;
+    EXPECT_EQ(addrlen, sizeof(*addr_out));
+    EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
+  }
+}
+
 TEST_P(UdpSocketTest, DisconnectAfterBindToAny) {
   struct sockaddr_storage baddr = {};
   socklen_t addrlen;
-- 
cgit v1.2.3


From bb641c54035e79e3e4c2752e07e6ac55c620b93f Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 3 Dec 2019 17:32:27 -0800
Subject: Point TODO to gvisor.dev

PiperOrigin-RevId: 283657725
---
 test/syscalls/linux/aio.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc
index b27d4e10a..a33daff17 100644
--- a/test/syscalls/linux/aio.cc
+++ b/test/syscalls/linux/aio.cc
@@ -129,7 +129,7 @@ TEST_F(AIOTest, BasicWrite) {
   // aio implementation uses aio_ring. gVisor doesn't and returns all zeroes.
   // Linux implements aio_ring, so skip the zeroes check.
   //
-  // TODO(b/65486370): Remove when gVisor implements aio_ring.
+  // TODO(gvisor.dev/issue/204): Remove when gVisor implements aio_ring.
   auto ring = reinterpret_cast<struct aio_ring*>(ctx_);
   auto magic = IsRunningOnGvisor() ? 0 : AIO_RING_MAGIC;
   EXPECT_EQ(ring->magic, magic);
-- 
cgit v1.2.3


From 80b7ba0c9709c0c7f4c3aef5637d23225bcb866b Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 3 Dec 2019 19:40:56 -0800
Subject: Clean up readv_socket test suite.

Get rid of the SocketTest class, which is only extended by ReadvSocketTest.
Also, get rid of TCP sockets (which were unused anyway) from readv_socket.cc.
This is a very old test suite that isn't the right place for TCP loopback
tests.

PiperOrigin-RevId: 283672772
---
 test/syscalls/linux/BUILD           |  1 -
 test/syscalls/linux/file_base.h     | 89 -------------------------------------
 test/syscalls/linux/readv_common.cc | 43 +++++++++++++++++-
 test/syscalls/linux/readv_socket.cc | 45 ++++++++++++++++---
 4 files changed, 80 insertions(+), 98 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index a865e8857..9cca78a93 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1795,7 +1795,6 @@ cc_binary(
     name = "readv_socket_test",
     testonly = 1,
     srcs = [
-        "file_base.h",
         "readv_common.cc",
         "readv_common.h",
         "readv_socket.cc",
diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h
index 4e048320e..6f80bc97c 100644
--- a/test/syscalls/linux/file_base.h
+++ b/test/syscalls/linux/file_base.h
@@ -111,95 +111,6 @@ class FileTest : public ::testing::Test {
   int test_pipe_[2];
 };
 
-class SocketTest : public ::testing::Test {
- public:
-  void SetUp() override {
-    test_unix_stream_socket_[0] = -1;
-    test_unix_stream_socket_[1] = -1;
-    test_unix_dgram_socket_[0] = -1;
-    test_unix_dgram_socket_[1] = -1;
-    test_unix_seqpacket_socket_[0] = -1;
-    test_unix_seqpacket_socket_[1] = -1;
-    test_tcp_socket_[0] = -1;
-    test_tcp_socket_[1] = -1;
-
-    ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, test_unix_stream_socket_),
-                SyscallSucceeds());
-    ASSERT_THAT(fcntl(test_unix_stream_socket_[0], F_SETFL, O_NONBLOCK),
-                SyscallSucceeds());
-    ASSERT_THAT(socketpair(AF_UNIX, SOCK_DGRAM, 0, test_unix_dgram_socket_),
-                SyscallSucceeds());
-    ASSERT_THAT(fcntl(test_unix_dgram_socket_[0], F_SETFL, O_NONBLOCK),
-                SyscallSucceeds());
-    ASSERT_THAT(
-        socketpair(AF_UNIX, SOCK_SEQPACKET, 0, test_unix_seqpacket_socket_),
-        SyscallSucceeds());
-    ASSERT_THAT(fcntl(test_unix_seqpacket_socket_[0], F_SETFL, O_NONBLOCK),
-                SyscallSucceeds());
-  }
-
-  void TearDown() override {
-    close(test_unix_stream_socket_[0]);
-    close(test_unix_stream_socket_[1]);
-
-    close(test_unix_dgram_socket_[0]);
-    close(test_unix_dgram_socket_[1]);
-
-    close(test_unix_seqpacket_socket_[0]);
-    close(test_unix_seqpacket_socket_[1]);
-
-    close(test_tcp_socket_[0]);
-    close(test_tcp_socket_[1]);
-  }
-
-  int test_unix_stream_socket_[2];
-  int test_unix_dgram_socket_[2];
-  int test_unix_seqpacket_socket_[2];
-  int test_tcp_socket_[2];
-};
-
-// MatchesStringLength checks that a tuple argument of (struct iovec *, int)
-// corresponding to an iovec array and its length, contains data that matches
-// the string length strlen.
-MATCHER_P(MatchesStringLength, strlen, "") {
-  struct iovec* iovs = arg.first;
-  int niov = arg.second;
-  int offset = 0;
-  for (int i = 0; i < niov; i++) {
-    offset += iovs[i].iov_len;
-  }
-  if (offset != static_cast<int>(strlen)) {
-    *result_listener << offset;
-    return false;
-  }
-  return true;
-}
-
-// MatchesStringValue checks that a tuple argument of (struct iovec *, int)
-// corresponding to an iovec array and its length, contains data that matches
-// the string value str.
-MATCHER_P(MatchesStringValue, str, "") {
-  struct iovec* iovs = arg.first;
-  int len = strlen(str);
-  int niov = arg.second;
-  int offset = 0;
-  for (int i = 0; i < niov; i++) {
-    struct iovec iov = iovs[i];
-    if (len < offset) {
-      *result_listener << "strlen " << len << " < offset " << offset;
-      return false;
-    }
-    if (strncmp(static_cast<char*>(iov.iov_base), &str[offset], iov.iov_len)) {
-      absl::string_view iovec_string(static_cast<char*>(iov.iov_base),
-                                     iov.iov_len);
-      *result_listener << iovec_string << " @offset " << offset;
-      return false;
-    }
-    offset += iov.iov_len;
-  }
-  return true;
-}
-
 }  // namespace testing
 }  // namespace gvisor
 
diff --git a/test/syscalls/linux/readv_common.cc b/test/syscalls/linux/readv_common.cc
index 9658f7d42..491d5f40f 100644
--- a/test/syscalls/linux/readv_common.cc
+++ b/test/syscalls/linux/readv_common.cc
@@ -19,12 +19,53 @@
 #include <unistd.h>
 
 #include "gtest/gtest.h"
-#include "test/syscalls/linux/file_base.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
 namespace testing {
 
+// MatchesStringLength checks that a tuple argument of (struct iovec *, int)
+// corresponding to an iovec array and its length, contains data that matches
+// the string length strlen.
+MATCHER_P(MatchesStringLength, strlen, "") {
+  struct iovec* iovs = arg.first;
+  int niov = arg.second;
+  int offset = 0;
+  for (int i = 0; i < niov; i++) {
+    offset += iovs[i].iov_len;
+  }
+  if (offset != static_cast<int>(strlen)) {
+    *result_listener << offset;
+    return false;
+  }
+  return true;
+}
+
+// MatchesStringValue checks that a tuple argument of (struct iovec *, int)
+// corresponding to an iovec array and its length, contains data that matches
+// the string value str.
+MATCHER_P(MatchesStringValue, str, "") {
+  struct iovec* iovs = arg.first;
+  int len = strlen(str);
+  int niov = arg.second;
+  int offset = 0;
+  for (int i = 0; i < niov; i++) {
+    struct iovec iov = iovs[i];
+    if (len < offset) {
+      *result_listener << "strlen " << len << " < offset " << offset;
+      return false;
+    }
+    if (strncmp(static_cast<char*>(iov.iov_base), &str[offset], iov.iov_len)) {
+      absl::string_view iovec_string(static_cast<char*>(iov.iov_base),
+                                     iov.iov_len);
+      *result_listener << iovec_string << " @offset " << offset;
+      return false;
+    }
+    offset += iov.iov_len;
+  }
+  return true;
+}
+
 extern const char kReadvTestData[] =
     "127.0.0.1      localhost"
     ""
diff --git a/test/syscalls/linux/readv_socket.cc b/test/syscalls/linux/readv_socket.cc
index 9b6972201..dd6fb7008 100644
--- a/test/syscalls/linux/readv_socket.cc
+++ b/test/syscalls/linux/readv_socket.cc
@@ -19,7 +19,6 @@
 #include <unistd.h>
 
 #include "gtest/gtest.h"
-#include "test/syscalls/linux/file_base.h"
 #include "test/syscalls/linux/readv_common.h"
 #include "test/util/test_util.h"
 
@@ -28,9 +27,30 @@ namespace testing {
 
 namespace {
 
-class ReadvSocketTest : public SocketTest {
+class ReadvSocketTest : public ::testing::Test {
+ public:
   void SetUp() override {
-    SocketTest::SetUp();
+    test_unix_stream_socket_[0] = -1;
+    test_unix_stream_socket_[1] = -1;
+    test_unix_dgram_socket_[0] = -1;
+    test_unix_dgram_socket_[1] = -1;
+    test_unix_seqpacket_socket_[0] = -1;
+    test_unix_seqpacket_socket_[1] = -1;
+
+    ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, test_unix_stream_socket_),
+                SyscallSucceeds());
+    ASSERT_THAT(fcntl(test_unix_stream_socket_[0], F_SETFL, O_NONBLOCK),
+                SyscallSucceeds());
+    ASSERT_THAT(socketpair(AF_UNIX, SOCK_DGRAM, 0, test_unix_dgram_socket_),
+                SyscallSucceeds());
+    ASSERT_THAT(fcntl(test_unix_dgram_socket_[0], F_SETFL, O_NONBLOCK),
+                SyscallSucceeds());
+    ASSERT_THAT(
+        socketpair(AF_UNIX, SOCK_SEQPACKET, 0, test_unix_seqpacket_socket_),
+        SyscallSucceeds());
+    ASSERT_THAT(fcntl(test_unix_seqpacket_socket_[0], F_SETFL, O_NONBLOCK),
+                SyscallSucceeds());
+
     ASSERT_THAT(
         write(test_unix_stream_socket_[1], kReadvTestData, kReadvTestDataSize),
         SyscallSucceedsWithValue(kReadvTestDataSize));
@@ -40,11 +60,22 @@ class ReadvSocketTest : public SocketTest {
     ASSERT_THAT(write(test_unix_seqpacket_socket_[1], kReadvTestData,
                       kReadvTestDataSize),
                 SyscallSucceedsWithValue(kReadvTestDataSize));
-    // FIXME(b/69821513): Enable when possible.
-    // ASSERT_THAT(write(test_tcp_socket_[1], kReadvTestData,
-    // kReadvTestDataSize),
-    //             SyscallSucceedsWithValue(kReadvTestDataSize));
   }
+
+  void TearDown() override {
+    close(test_unix_stream_socket_[0]);
+    close(test_unix_stream_socket_[1]);
+
+    close(test_unix_dgram_socket_[0]);
+    close(test_unix_dgram_socket_[1]);
+
+    close(test_unix_seqpacket_socket_[0]);
+    close(test_unix_seqpacket_socket_[1]);
+  }
+
+  int test_unix_stream_socket_[2];
+  int test_unix_dgram_socket_[2];
+  int test_unix_seqpacket_socket_[2];
 };
 
 TEST_F(ReadvSocketTest, ReadOneBufferPerByte_StreamSocket) {
-- 
cgit v1.2.3


From 05758f34b2f65b7e6b118d3719cb8ce37eb4bc79 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 5 Dec 2019 05:43:52 -0800
Subject: Explicitly export files needed by other packages

PiperOrigin-RevId: 283955946
---
 test/syscalls/linux/BUILD | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 9cca78a93..7ce2e6270 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -6,6 +6,16 @@ package(
     licenses = ["notice"],
 )
 
+exports_files(
+    [
+        "socket.cc",
+        "socket_ipv4_udp_unbound_loopback.cc",
+        "tcp_socket.cc",
+        "udp_socket.cc",
+    ],
+    visibility = ["//:sandbox"],
+)
+
 cc_binary(
     name = "sigaltstack_check",
     testonly = 1,
-- 
cgit v1.2.3


From 0a32c0235744191947a6bf890031026e06788837 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Thu, 5 Dec 2019 13:22:31 -0800
Subject: Create correct file for /proc/[pid]/task/[tid]/io

PiperOrigin-RevId: 284038840
---
 pkg/sentry/fs/proc/task.go  | 32 +++++++++++++++++---------------
 test/syscalls/linux/proc.cc | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 15 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 2a598149d..0e46c5fb7 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -67,29 +67,28 @@ type taskDir struct {
 var _ fs.InodeOperations = (*taskDir)(nil)
 
 // newTaskDir creates a new proc task entry.
-func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, showSubtasks bool) *fs.Inode {
+func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
 	contents := map[string]*fs.Inode{
-		"auxv":    newAuxvec(t, msrc),
-		"cmdline": newExecArgInode(t, msrc, cmdlineExecArg),
-		"comm":    newComm(t, msrc),
-		"environ": newExecArgInode(t, msrc, environExecArg),
-		"exe":     newExe(t, msrc),
-		"fd":      newFdDir(t, msrc),
-		"fdinfo":  newFdInfoDir(t, msrc),
-		"gid_map": newGIDMap(t, msrc),
-		// FIXME(b/123511468): create the correct io file for threads.
-		"io":        newIO(t, msrc),
+		"auxv":      newAuxvec(t, msrc),
+		"cmdline":   newExecArgInode(t, msrc, cmdlineExecArg),
+		"comm":      newComm(t, msrc),
+		"environ":   newExecArgInode(t, msrc, environExecArg),
+		"exe":       newExe(t, msrc),
+		"fd":        newFdDir(t, msrc),
+		"fdinfo":    newFdInfoDir(t, msrc),
+		"gid_map":   newGIDMap(t, msrc),
+		"io":        newIO(t, msrc, isThreadGroup),
 		"maps":      newMaps(t, msrc),
 		"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
 		"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
 		"ns":        newNamespaceDir(t, msrc),
 		"smaps":     newSmaps(t, msrc),
-		"stat":      newTaskStat(t, msrc, showSubtasks, p.pidns),
+		"stat":      newTaskStat(t, msrc, isThreadGroup, p.pidns),
 		"statm":     newStatm(t, msrc),
 		"status":    newStatus(t, msrc, p.pidns),
 		"uid_map":   newUIDMap(t, msrc),
 	}
-	if showSubtasks {
+	if isThreadGroup {
 		contents["task"] = p.newSubtasks(t, msrc)
 	}
 	if len(p.cgroupControllers) > 0 {
@@ -619,8 +618,11 @@ type ioData struct {
 	ioUsage
 }
 
-func newIO(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
-	return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t)
+func newIO(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
+	if isThreadGroup {
+		return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t)
+	}
+	return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t}), msrc, fs.SpecialFile, t)
 }
 
 // NeedsUpdate returns whether the generation is old or not.
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 512de5ee0..8cf08991b 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -37,6 +37,7 @@
 #include <map>
 #include <memory>
 #include <ostream>
+#include <regex>
 #include <string>
 #include <unordered_set>
 #include <utility>
@@ -51,6 +52,7 @@
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
+#include "absl/synchronization/notification.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/util/capability_util.h"
@@ -1988,6 +1990,44 @@ TEST(Proc, GetdentsEnoent) {
               SyscallFailsWithErrno(ENOENT));
 }
 
+void CheckSyscwFromIOFile(const std::string& path, const std::string& regex) {
+  std::string output;
+  ASSERT_NO_ERRNO(GetContents(path, &output));
+  ASSERT_THAT(output, ContainsRegex(absl::StrCat("syscw:\\s+", regex, "\n")));
+}
+
+// Checks that IO is accounted separately for each thread/task.
+TEST(Proc, PidTidIOAccounting) {
+  absl::Notification notification;
+
+  // Run a thread with a bunch of writes. Check that its io file records exactly
+  // the number of write calls. File open/close is there to prevent buffering.
+  ScopedThread writer([&notification] {
+    const int num_writes = 100;
+    for (int i = 0; i < num_writes; i++) {
+      auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+      ASSERT_NO_ERRNO(SetContents(path.path(), "a"));
+    }
+    notification.Notify();
+    const std::string& writer_dir =
+        absl::StrCat("/proc/", getpid(), "/task/", gettid(), "/io");
+
+    CheckSyscwFromIOFile(writer_dir, std::to_string(num_writes));
+  });
+
+  // Run a thread and do no writes. Check that no writes are recorded.
+  ScopedThread noop([&notification] {
+    notification.WaitForNotification();
+    const std::string& noop_dir =
+        absl::StrCat("/proc/", getpid(), "/task/", gettid(), "/io");
+
+    CheckSyscwFromIOFile(noop_dir, "0");
+  });
+
+  writer.Join();
+  noop.Join();
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From f053c528122c246b4a454de54dacfffe0f7964f0 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 5 Dec 2019 13:55:37 -0800
Subject: Reduce flakiness under gotsan runs.

TcpPortReuseMultiThread creates lots of connections which result in
a lot of goroutines in the sentry. This can cause gotsan runs to
take a very long time and time out. Increasing the listen backlog and
reducing the number of connections should help the connections complete
faster as well as reduce the number of goroutines that gotsan needs
to track.

PiperOrigin-RevId: 284046018
---
 test/syscalls/linux/socket_inet_loopback.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 96a1731cf..fa4358ae4 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -635,7 +635,9 @@ INSTANTIATE_TEST_SUITE_P(
 
 using SocketInetReusePortTest = ::testing::TestWithParam<TestParam>;
 
-TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
+// TODO(gvisor.dev/issue/940): Remove _NoRandomSave when portHint/stack.Seed is
+// saved/restored.
+TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
   auto const& param = GetParam();
 
   TestAddress const& listener = param.listener;
@@ -643,6 +645,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
   sockaddr_storage listen_addr = listener.addr;
   sockaddr_storage conn_addr = connector.addr;
   constexpr int kThreadCount = 3;
+  constexpr int kConnectAttempts = 4096;
 
   // Create the listening socket.
   FileDescriptor listener_fds[kThreadCount];
@@ -657,7 +660,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
     ASSERT_THAT(
         bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
         SyscallSucceeds());
-    ASSERT_THAT(listen(fd, 40), SyscallSucceeds());
+    ASSERT_THAT(listen(fd, kConnectAttempts / 3), SyscallSucceeds());
 
     // On the first bind we need to determine which port was bound.
     if (i != 0) {
@@ -676,7 +679,6 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
     ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
   }
 
-  constexpr int kConnectAttempts = 10000;
   std::atomic<int> connects_received = ATOMIC_VAR_INIT(0);
   std::unique_ptr<ScopedThread> listen_thread[kThreadCount];
   int accept_counts[kThreadCount] = {};
-- 
cgit v1.2.3


From 13f0f6069af4d49e236cbee4f0284c190784db37 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Thu, 5 Dec 2019 17:27:15 -0800
Subject: Implement F_GETOWN_EX and F_SETOWN_EX.

Some versions of glibc will convert F_GETOWN fcntl(2) calls into F_GETOWN_EX in
some cases.

PiperOrigin-RevId: 284089373
---
 pkg/abi/linux/fcntl.go                |  41 ++++++---
 pkg/sentry/syscalls/linux/sys_file.go |  70 +++++++++++++--
 test/syscalls/linux/BUILD             |   1 +
 test/syscalls/linux/fcntl.cc          | 162 +++++++++++++++++++++++++++++++++-
 test/syscalls/linux/ioctl.cc          |   3 +-
 5 files changed, 255 insertions(+), 22 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go
index f78315ebf..6663a199c 100644
--- a/pkg/abi/linux/fcntl.go
+++ b/pkg/abi/linux/fcntl.go
@@ -16,15 +16,17 @@ package linux
 
 // Commands from linux/fcntl.h.
 const (
-	F_DUPFD         = 0x0
-	F_GETFD         = 0x1
-	F_SETFD         = 0x2
-	F_GETFL         = 0x3
-	F_SETFL         = 0x4
-	F_SETLK         = 0x6
-	F_SETLKW        = 0x7
-	F_SETOWN        = 0x8
-	F_GETOWN        = 0x9
+	F_DUPFD         = 0
+	F_GETFD         = 1
+	F_SETFD         = 2
+	F_GETFL         = 3
+	F_SETFL         = 4
+	F_SETLK         = 6
+	F_SETLKW        = 7
+	F_SETOWN        = 8
+	F_GETOWN        = 9
+	F_SETOWN_EX     = 15
+	F_GETOWN_EX     = 16
 	F_DUPFD_CLOEXEC = 1024 + 6
 	F_SETPIPE_SZ    = 1024 + 7
 	F_GETPIPE_SZ    = 1024 + 8
@@ -32,9 +34,9 @@ const (
 
 // Commands for F_SETLK.
 const (
-	F_RDLCK = 0x0
-	F_WRLCK = 0x1
-	F_UNLCK = 0x2
+	F_RDLCK = 0
+	F_WRLCK = 1
+	F_UNLCK = 2
 )
 
 // Flags for fcntl.
@@ -42,7 +44,7 @@ const (
 	FD_CLOEXEC = 00000001
 )
 
-// Lock structure for F_SETLK.
+// Flock is the lock structure for F_SETLK.
 type Flock struct {
 	Type   int16
 	Whence int16
@@ -52,3 +54,16 @@ type Flock struct {
 	Pid    int32
 	_      [4]byte
 }
+
+// Flags for F_SETOWN_EX and F_GETOWN_EX.
+const (
+	F_OWNER_TID  = 0
+	F_OWNER_PID  = 1
+	F_OWNER_PGRP = 2
+)
+
+// FOwnerEx is the owner structure for F_SETOWN_EX and F_GETOWN_EX.
+type FOwnerEx struct {
+	Type int32
+	PID  int32
+}
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 3b9181002..9bc2445a5 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -840,25 +840,42 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 	return uintptr(newfd), nil, nil
 }
 
-func fGetOwn(t *kernel.Task, file *fs.File) int32 {
+func fGetOwnEx(t *kernel.Task, file *fs.File) linux.FOwnerEx {
 	ma := file.Async(nil)
 	if ma == nil {
-		return 0
+		return linux.FOwnerEx{}
 	}
 	a := ma.(*fasync.FileAsync)
 	ot, otg, opg := a.Owner()
 	switch {
 	case ot != nil:
-		return int32(t.PIDNamespace().IDOfTask(ot))
+		return linux.FOwnerEx{
+			Type: linux.F_OWNER_TID,
+			PID:  int32(t.PIDNamespace().IDOfTask(ot)),
+		}
 	case otg != nil:
-		return int32(t.PIDNamespace().IDOfThreadGroup(otg))
+		return linux.FOwnerEx{
+			Type: linux.F_OWNER_PID,
+			PID:  int32(t.PIDNamespace().IDOfThreadGroup(otg)),
+		}
 	case opg != nil:
-		return int32(-t.PIDNamespace().IDOfProcessGroup(opg))
+		return linux.FOwnerEx{
+			Type: linux.F_OWNER_PGRP,
+			PID:  int32(t.PIDNamespace().IDOfProcessGroup(opg)),
+		}
 	default:
-		return 0
+		return linux.FOwnerEx{}
 	}
 }
 
+func fGetOwn(t *kernel.Task, file *fs.File) int32 {
+	owner := fGetOwnEx(t, file)
+	if owner.Type == linux.F_OWNER_PGRP {
+		return -owner.PID
+	}
+	return owner.PID
+}
+
 // fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux.
 //
 // If who is positive, it represents a PID. If negative, it represents a PGID.
@@ -901,11 +918,13 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		t.FDTable().SetFlags(fd, kernel.FDFlags{
 			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
 		})
+		return 0, nil, nil
 	case linux.F_GETFL:
 		return uintptr(file.Flags().ToLinux()), nil, nil
 	case linux.F_SETFL:
 		flags := uint(args[2].Uint())
 		file.SetFlags(linuxToFlags(flags).Settable())
+		return 0, nil, nil
 	case linux.F_SETLK, linux.F_SETLKW:
 		// In Linux the file system can choose to provide lock operations for an inode.
 		// Normally pipe and socket types lack lock operations. We diverge and use a heavy
@@ -1008,6 +1027,44 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	case linux.F_SETOWN:
 		fSetOwn(t, file, args[2].Int())
 		return 0, nil, nil
+	case linux.F_GETOWN_EX:
+		addr := args[2].Pointer()
+		owner := fGetOwnEx(t, file)
+		_, err := t.CopyOut(addr, &owner)
+		return 0, nil, err
+	case linux.F_SETOWN_EX:
+		addr := args[2].Pointer()
+		var owner linux.FOwnerEx
+		n, err := t.CopyIn(addr, &owner)
+		if err != nil {
+			return 0, nil, err
+		}
+		a := file.Async(fasync.New).(*fasync.FileAsync)
+		switch owner.Type {
+		case linux.F_OWNER_TID:
+			task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID))
+			if task == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			a.SetOwnerTask(t, task)
+			return uintptr(n), nil, nil
+		case linux.F_OWNER_PID:
+			tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID))
+			if tg == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			a.SetOwnerThreadGroup(t, tg)
+			return uintptr(n), nil, nil
+		case linux.F_OWNER_PGRP:
+			pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID))
+			if pg == nil {
+				return 0, nil, syserror.ESRCH
+			}
+			a.SetOwnerProcessGroup(t, pg)
+			return uintptr(n), nil, nil
+		default:
+			return 0, nil, syserror.EINVAL
+		}
 	case linux.F_GET_SEALS:
 		val, err := tmpfs.GetSeals(file.Dirent.Inode)
 		return uintptr(val), nil, err
@@ -1035,7 +1092,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		// Everything else is not yet supported.
 		return 0, nil, syserror.EINVAL
 	}
-	return 0, nil, nil
 }
 
 const (
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 7ce2e6270..61f310db9 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -753,6 +753,7 @@ cc_binary(
         "//test/util:eventfd_util",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
+        "//test/util:save_util",
         "//test/util:temp_path",
         "//test/util:test_util",
         "//test/util:timer_util",
diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 8a45be12a..4f3aa81d6 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -14,6 +14,7 @@
 
 #include <fcntl.h>
 #include <signal.h>
+#include <sys/types.h>
 #include <syscall.h>
 #include <unistd.h>
 
@@ -32,6 +33,7 @@
 #include "test/util/eventfd_util.h"
 #include "test/util/multiprocess_util.h"
 #include "test/util/posix_error.h"
+#include "test/util/save_util.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 #include "test/util/timer_util.h"
@@ -910,8 +912,166 @@ TEST(FcntlTest, GetOwn) {
   FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
       Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
 
-  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN),
+  EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), 0);
+  MaybeSave();
+}
+
+TEST(FcntlTest, GetOwnEx) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &owner),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST(FcntlTest, SetOwnExInvalidType) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = __pid_type(-1);
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(FcntlTest, SetOwnExInvalidTid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_TID;
+  owner.pid = -1;
+
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExInvalidPid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_PID;
+  owner.pid = -1;
+
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExInvalidPgrp) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_PGRP;
+  owner.pid = -1;
+
+  EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExTid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_TID;
+  EXPECT_THAT(owner.pid = syscall(__NR_gettid), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallSucceeds());
+
+  EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), owner.pid);
+  MaybeSave();
+}
+
+TEST(FcntlTest, SetOwnExPid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_PID;
+  EXPECT_THAT(owner.pid = getpid(), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallSucceeds());
+
+  EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), owner.pid);
+  MaybeSave();
+}
+
+TEST(FcntlTest, SetOwnExPgrp) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex owner = {};
+  owner.type = F_OWNER_PGRP;
+  EXPECT_THAT(owner.pid = getpgrp(), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+              SyscallSucceeds());
+
+  // NOTE(igudger): I don't understand why, but this is flaky on Linux.
+  // GetOwnExPgrp (below) does not have this issue.
+  SKIP_IF(!IsRunningOnGvisor());
+
+  EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), -owner.pid);
+  MaybeSave();
+}
+
+TEST(FcntlTest, GetOwnExTid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex set_owner = {};
+  set_owner.type = F_OWNER_TID;
+  EXPECT_THAT(set_owner.pid = syscall(__NR_gettid), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+              SyscallSucceeds());
+
+  f_owner_ex got_owner = {};
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(got_owner.type, set_owner.type);
+  EXPECT_EQ(got_owner.pid, set_owner.pid);
+}
+
+TEST(FcntlTest, GetOwnExPid) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex set_owner = {};
+  set_owner.type = F_OWNER_PID;
+  EXPECT_THAT(set_owner.pid = getpid(), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+              SyscallSucceeds());
+
+  f_owner_ex got_owner = {};
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(got_owner.type, set_owner.type);
+  EXPECT_EQ(got_owner.pid, set_owner.pid);
+}
+
+TEST(FcntlTest, GetOwnExPgrp) {
+  FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+  f_owner_ex set_owner = {};
+  set_owner.type = F_OWNER_PGRP;
+  EXPECT_THAT(set_owner.pid = getpgrp(), SyscallSucceeds());
+
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+              SyscallSucceeds());
+
+  f_owner_ex got_owner = {};
+  ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
               SyscallSucceedsWithValue(0));
+  EXPECT_EQ(got_owner.type, set_owner.type);
+  EXPECT_EQ(got_owner.pid, set_owner.pid);
 }
 
 }  // namespace
diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc
index c4f8bff08..b0a07a064 100644
--- a/test/syscalls/linux/ioctl.cc
+++ b/test/syscalls/linux/ioctl.cc
@@ -215,7 +215,8 @@ TEST_F(IoctlTest, FIOASYNCSelfTarget2) {
   auto mask_cleanup =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO));
 
-  pid_t pid = getpid();
+  pid_t pid = -1;
+  EXPECT_THAT(pid = getpid(), SyscallSucceeds());
   EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds());
 
   int set = 1;
-- 
cgit v1.2.3


From b0066217ecd830be1d816d2b4d824f89b278c556 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 6 Dec 2019 12:12:27 -0800
Subject: Add hostinet tests for UDP sockets.

We need to skip a subset of the tests, because of features that hostinet does
not currently support.

Fixes #1209

PiperOrigin-RevId: 284235911
---
 test/syscalls/BUILD                          |  1 +
 test/syscalls/linux/udp_socket_test_cases.cc | 35 ++++++++++++++++++++++++++--
 2 files changed, 34 insertions(+), 2 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 722d14b53..6650984fa 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -669,6 +669,7 @@ syscall_test(test = "//test/syscalls/linux:udp_bind_test")
 
 syscall_test(
     size = "medium",
+    add_hostinet = True,
     shard_count = 10,
     test = "//test/syscalls/linux:udp_socket_test",
 )
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 63b92d6a7..4556f16d6 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -656,6 +656,9 @@ TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) {
 }
 
 TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
+  // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
+  SKIP_IF(IsRunningWithHostinet());
+
   // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
@@ -673,6 +676,9 @@ TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
 }
 
 TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
+  // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
+  SKIP_IF(IsRunningWithHostinet());
+
   // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
@@ -878,6 +884,10 @@ TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) {
 }
 
 TEST_P(UdpSocketTest, ReadShutdown) {
+  // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
+  // MSG_DONTWAIT blocks indefinitely.
+  SKIP_IF(IsRunningWithHostinet());
+
   char received[512];
   EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
               SyscallFailsWithErrno(EWOULDBLOCK));
@@ -900,6 +910,10 @@ TEST_P(UdpSocketTest, ReadShutdown) {
 }
 
 TEST_P(UdpSocketTest, ReadShutdownDifferentThread) {
+  // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
+  // MSG_DONTWAIT blocks indefinitely.
+  SKIP_IF(IsRunningWithHostinet());
+
   char received[512];
   EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
               SyscallFailsWithErrno(EWOULDBLOCK));
@@ -1189,6 +1203,10 @@ TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) {
 }
 
 TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
+  // TODO(gvisor.dev/issue/1202): SO_TIMESTAMP socket option not supported by
+  // hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   int v = -1;
   socklen_t optlen = sizeof(v);
   ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, &optlen),
@@ -1198,6 +1216,10 @@ TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
 }
 
 TEST_P(UdpSocketTest, SoTimestamp) {
+  // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
+  // supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1241,6 +1263,9 @@ TEST_P(UdpSocketTest, WriteShutdownNotConnected) {
 }
 
 TEST_P(UdpSocketTest, TimestampIoctl) {
+  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1259,7 +1284,10 @@ TEST_P(UdpSocketTest, TimestampIoctl) {
   ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
 }
 
-TEST_P(UdpSocketTest, TimetstampIoctlNothingRead) {
+TEST_P(UdpSocketTest, TimestampIoctlNothingRead) {
+  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1270,6 +1298,10 @@ TEST_P(UdpSocketTest, TimetstampIoctlNothingRead) {
 // Test that the timestamp accessed via SIOCGSTAMP is still accessible after
 // SO_TIMESTAMP is enabled and used to retrieve a timestamp.
 TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
+  // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
+  // supported by hostinet.
+  SKIP_IF(IsRunningWithHostinet());
+
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1304,7 +1336,6 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
   msg.msg_controllen = sizeof(cmsgbuf);
   ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
   struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
-  cmsg = CMSG_FIRSTHDR(&msg);
   ASSERT_NE(cmsg, nullptr);
 
   // The ioctl should return the exact same values as before.
-- 
cgit v1.2.3


From 498595d54347d711dbd24247ed12c659b9d89c58 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Mon, 9 Dec 2019 11:21:01 -0800
Subject: Add tests for rseq(2)

Add a decent set of syscall tests for rseq(2). These are a bit awkward because
of issues with library integration. libc may register rseq on thread start
(including before main on the initial thread), precluding much testing. Thus we
run tests in a libc-free subprocess.

Support for rseq(2) in gVisor will come in a later commit.

PiperOrigin-RevId: 284595994
---
 test/syscalls/BUILD                 |   2 +
 test/syscalls/linux/BUILD           |  16 ++
 test/syscalls/linux/rseq.cc         | 198 +++++++++++++++++++
 test/syscalls/linux/rseq/BUILD      |  59 ++++++
 test/syscalls/linux/rseq/critical.S |  66 +++++++
 test/syscalls/linux/rseq/critical.h |  39 ++++
 test/syscalls/linux/rseq/rseq.cc    | 366 ++++++++++++++++++++++++++++++++++++
 test/syscalls/linux/rseq/start.S    |  45 +++++
 test/syscalls/linux/rseq/syscalls.h |  66 +++++++
 test/syscalls/linux/rseq/test.h     |  43 +++++
 test/syscalls/linux/rseq/types.h    |  31 +++
 test/syscalls/linux/rseq/uapi.h     |  54 ++++++
 12 files changed, 985 insertions(+)
 create mode 100644 test/syscalls/linux/rseq.cc
 create mode 100644 test/syscalls/linux/rseq/BUILD
 create mode 100644 test/syscalls/linux/rseq/critical.S
 create mode 100644 test/syscalls/linux/rseq/critical.h
 create mode 100644 test/syscalls/linux/rseq/rseq.cc
 create mode 100644 test/syscalls/linux/rseq/start.S
 create mode 100644 test/syscalls/linux/rseq/syscalls.h
 create mode 100644 test/syscalls/linux/rseq/test.h
 create mode 100644 test/syscalls/linux/rseq/types.h
 create mode 100644 test/syscalls/linux/rseq/uapi.h

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 6650984fa..829693e8e 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -376,6 +376,8 @@ syscall_test(
 
 syscall_test(test = "//test/syscalls/linux:rlimits_test")
 
+syscall_test(test = "//test/syscalls/linux:rseq_test")
+
 syscall_test(test = "//test/syscalls/linux:rtsignal_test")
 
 syscall_test(test = "//test/syscalls/linux:sched_test")
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 61f310db9..c49445d62 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1852,6 +1852,22 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "rseq_test",
+    testonly = 1,
+    srcs = ["rseq.cc"],
+    data = ["//test/syscalls/linux/rseq"],
+    linkstatic = 1,
+    deps = [
+        "//test/syscalls/linux/rseq:lib",
+        "//test/util:logging",
+        "//test/util:multiprocess_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_binary(
     name = "rtsignal_test",
     testonly = 1,
diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc
new file mode 100644
index 000000000..106c045e3
--- /dev/null
+++ b/test/syscalls/linux/rseq.cc
@@ -0,0 +1,198 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/rseq/test.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+#include "test/util/logging.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Syscall test for rseq (restartable sequences).
+//
+// We must be very careful about how these tests are written. Each thread may
+// only have one struct rseq registration, which may be done automatically at
+// thread start (as of 2019-11-13, glibc does *not* support rseq and thus does
+// not do so).
+//
+// Testing of rseq is thus done primarily in a child process with no
+// registration. This means exec'ing a nostdlib binary, as rseq registration can
+// only be cleared by execve (or by knowing the old rseq address), and glibc
+// (based on the current unmerged patches) registers rseq before calling main().
+
+int RSeq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
+  return syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
+}
+
+// Probes the running kernel for rseq(2) support by attempting a registration.
+PosixErrorOr<bool> RSeqSupported() {
+  // A probe registration has three possible outcomes:
+  //
+  // 1. ENOSYS: the kernel has no rseq syscall.
+  // 2. Success: rseq exists and this thread was unregistered, so the probe
+  //    registration must be undone before returning.
+  // 3. EINVAL (most likely): rseq exists and this thread is already registered.
+
+  // The only validation done on new registrations is that rseq is aligned and
+  // writable, so a zero-initialized struct is used for the probe.
+  rseq probe = {};
+  if (RSeq(&probe, sizeof(probe), 0, 0) != 0) {
+    const int err = errno;
+    if (err == ENOSYS) {
+      // Not supported.
+      return false;
+    }
+    if (err == EINVAL) {
+      // Supported, but already registered. EINVAL returned because we provided
+      // a different address.
+      return true;
+    }
+
+    // Unknown error.
+    return PosixError(err);
+  }
+
+  // Successfully registered, rseq is supported. Unregister.
+  if (RSeq(&probe, sizeof(probe), kRseqFlagUnregister, 0) != 0) {
+    return PosixError(errno);
+  }
+
+  return true;
+}
+
+constexpr char kRseqBinary[] = "test/syscalls/linux/rseq/rseq";
+
+void RunChildTest(std::string test_case, int want_status) {
+  std::string path = RunfilePath(kRseqBinary);
+
+  pid_t child_pid = -1;
+  int execve_errno = 0;
+  auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+      ForkAndExec(path, {path, test_case}, {}, &child_pid, &execve_errno));
+
+  ASSERT_GT(child_pid, 0);
+  ASSERT_EQ(execve_errno, 0);
+
+  int status = 0;
+  ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+  ASSERT_EQ(status, want_status);
+}
+
+// Test that rseq must be aligned.
+TEST(RseqTest, Unaligned) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestUnaligned, 0);
+}
+
+// Sanity test that registration works.
+TEST(RseqTest, Register) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestRegister, 0);
+}
+
+// Registration can't be done twice.
+TEST(RseqTest, DoubleRegister) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestDoubleRegister, 0);
+}
+
+// Registration can be done again after unregister.
+TEST(RseqTest, RegisterUnregister) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestRegisterUnregister, 0);
+}
+
+// The pointer to rseq must match on register/unregister.
+TEST(RseqTest, UnregisterDifferentPtr) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestUnregisterDifferentPtr, 0);
+}
+
+// The signature must match on register/unregister.
+TEST(RseqTest, UnregisterDifferentSignature) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestUnregisterDifferentSignature, 0);
+}
+
+// The CPU ID is initialized.
+TEST(RseqTest, CPU) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestCPU, 0);
+}
+
+// Critical section is eventually aborted.
+TEST(RseqTest, Abort) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbort, 0);
+}
+
+// Abort may be before the critical section.
+TEST(RseqTest, AbortBefore) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbortBefore, 0);
+}
+
+// Signature must match.
+TEST(RseqTest, AbortSignature) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbortSignature, SIGSEGV);
+}
+
+// Abort must not be in the critical section.
+TEST(RseqTest, AbortPreCommit) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbortPreCommit, SIGSEGV);
+}
+
+// rseq.rseq_cs is cleared on abort.
+TEST(RseqTest, AbortClearsCS) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestAbortClearsCS, 0);
+}
+
+// rseq.rseq_cs is cleared on abort outside of critical section.
+TEST(RseqTest, InvalidAbortClearsCS) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+  RunChildTest(kRseqTestInvalidAbortClearsCS, 0);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/rseq/BUILD b/test/syscalls/linux/rseq/BUILD
new file mode 100644
index 000000000..5cfe4e56f
--- /dev/null
+++ b/test/syscalls/linux/rseq/BUILD
@@ -0,0 +1,59 @@
+# This package contains a standalone rseq test binary. This binary must not
+# depend on libc, which might use rseq itself.
+
+load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", "cc_flags_supplier")
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
+package(licenses = ["notice"])
+
+genrule(
+    name = "rseq_binary",
+    srcs = [
+        "critical.h",
+        "critical.S",
+        "rseq.cc",
+        "syscalls.h",
+        "start.S",
+        "test.h",
+        "types.h",
+        "uapi.h",
+    ],
+    outs = ["rseq"],
+    cmd = " ".join([
+        "$(CC)",
+        "$(CC_FLAGS) ",
+        "-I.",
+        "-Wall",
+        "-Werror",
+        "-O2",
+        "-std=c++17",
+        "-static",
+        "-nostdlib",
+        "-ffreestanding",
+        "-o",
+        "$(location rseq)",
+        "$(location critical.S)",
+        "$(location rseq.cc)",
+        "$(location start.S)",
+    ]),
+    toolchains = [
+        ":no_pie_cc_flags",
+        "@bazel_tools//tools/cpp:current_cc_toolchain",
+    ],
+    visibility = ["//:sandbox"],
+)
+
+cc_flags_supplier(
+    name = "no_pie_cc_flags",
+    features = ["-pie"],
+)
+
+cc_library(
+    name = "lib",
+    testonly = 1,
+    hdrs = [
+        "test.h",
+        "uapi.h",
+    ],
+    visibility = ["//:sandbox"],
+)
diff --git a/test/syscalls/linux/rseq/critical.S b/test/syscalls/linux/rseq/critical.S
new file mode 100644
index 000000000..8c0687e6d
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical.S
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Restartable sequences critical sections.
+
+// Loops continuously until aborted.
+//
+// void rseq_loop(struct rseq* r, struct rseq_cs* cs)
+
+  .text
+  .globl  rseq_loop
+  .type   rseq_loop, @function
+
+rseq_loop:
+  jmp begin
+
+  // Abort block before the critical section.
+  // Abort signature is 4 nops for simplicity.
+  .byte 0x90, 0x90, 0x90, 0x90
+  .globl  rseq_loop_early_abort
+rseq_loop_early_abort:
+  ret
+
+begin:
+  // r->rseq_cs = cs
+  movq %rsi, 8(%rdi)
+
+  // N.B. rseq_cs will be cleared by any preempt, even outside the critical
+  // section. Thus it must be set in or immediately before the critical section
+  // to ensure it is not cleared before the section begins.
+  .globl  rseq_loop_start
+rseq_loop_start:
+  jmp rseq_loop_start
+
+  // "Pre-commit": extra instructions inside the critical section.  These are
+  // used as the abort point in TestAbortPreCommit, which is not valid.
+  .globl  rseq_loop_pre_commit
+rseq_loop_pre_commit:
+  // Extra abort signature + nop for TestAbortPostCommit.
+  .byte 0x90, 0x90, 0x90, 0x90
+  nop
+
+  // "Post-commit": never reached in this case.
+  .globl  rseq_loop_post_commit
+rseq_loop_post_commit:
+
+  // Abort signature is 4 nops for simplicity.
+  .byte 0x90, 0x90, 0x90, 0x90
+
+  .globl  rseq_loop_abort
+rseq_loop_abort:
+  ret
+
+  .size  rseq_loop,.-rseq_loop
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/critical.h b/test/syscalls/linux/rseq/critical.h
new file mode 100644
index 000000000..ac987a25e
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical.h
@@ -0,0 +1,39 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
+
+#include "test/syscalls/linux/rseq/types.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+
+constexpr uint32_t kRseqSignature = 0x90909090;
+
+extern "C" {
+
+extern void rseq_loop(struct rseq* r, struct rseq_cs* cs);
+extern void* rseq_loop_early_abort;
+extern void* rseq_loop_start;
+extern void* rseq_loop_pre_commit;
+extern void* rseq_loop_post_commit;
+extern void* rseq_loop_abort;
+
+extern int rseq_getpid(struct rseq* r, struct rseq_cs* cs);
+extern void* rseq_getpid_start;
+extern void* rseq_getpid_post_commit;
+extern void* rseq_getpid_abort;
+
+}  // extern "C"
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
diff --git a/test/syscalls/linux/rseq/rseq.cc b/test/syscalls/linux/rseq/rseq.cc
new file mode 100644
index 000000000..f036db26d
--- /dev/null
+++ b/test/syscalls/linux/rseq/rseq.cc
@@ -0,0 +1,366 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/rseq/critical.h"
+#include "test/syscalls/linux/rseq/syscalls.h"
+#include "test/syscalls/linux/rseq/test.h"
+#include "test/syscalls/linux/rseq/types.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+
+namespace gvisor {
+namespace testing {
+
+extern "C" int main(int argc, char** argv, char** envp);
+
+// Standalone entry point: decodes argc, argv and envp from sp, then runs main().
+extern "C" void __init(uintptr_t* sp) {
+  int argc = sp[0];
+  char** argv = reinterpret_cast<char**>(&sp[1]);
+  char** envp = &argv[argc + 1];
+
+  // Run main() and exit the whole process with its return value.
+  sys_exit_group(main(argc, argv, envp));
+
+  // sys_exit_group does not return, so control never reaches this point.
+}
+
+int strcmp(const char* s1, const char* s2) {
+  // Compare as unsigned bytes so the sign of the difference is well defined.
+  auto* lhs = reinterpret_cast<const unsigned char*>(s1);
+  auto* rhs = reinterpret_cast<const unsigned char*>(s2);
+
+  for (; *lhs == *rhs; ++lhs, ++rhs) {
+    if (*lhs == '\0') {
+      // Both strings ended at the same position: they are equal.
+      return 0;
+    }
+  }
+  return static_cast<int>(*lhs) - static_cast<int>(*rhs);
+}
+
+int sys_rseq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
+  return raw_syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
+}
+
+// Registration of a misaligned struct rseq must be rejected with EINVAL.
+int TestUnaligned() {
+  constexpr uintptr_t kAlign = alignof(rseq);
+
+  // Twice the alignment leaves room to shift the address by one byte.
+  char storage[2 * kAlign] = {};
+  uintptr_t addr = reinterpret_cast<uintptr_t>(&storage[0]);
+  const bool aligned = (addr % kAlign) == 0;
+  if (aligned) {
+    // Bump an aligned address by one so it is guaranteed to be misaligned.
+    addr += 1;
+  }
+
+  const int ret = sys_rseq(reinterpret_cast<rseq*>(addr), sizeof(rseq), 0, 0);
+  const bool rejected = sys_errno(ret) == EINVAL;
+  return rejected ? 0 : 1;
+}
+
+// Sanity test that registration works.
+int TestRegister() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+  return 0;
+};
+
+// Registration can't be done twice.
+int TestDoubleRegister() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != EBUSY) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// Registration can be done again after unregister.
+int TestRegisterUnregister() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, 0);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// The pointer to rseq must match on register/unregister.
+int TestUnregisterDifferentPtr() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq r2 = {};
+  if (int ret = sys_rseq(&r2, sizeof(r2), kRseqFlagUnregister, 0);
+      sys_errno(ret) != EINVAL) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// The signature must match on register/unregister.
+int TestUnregisterDifferentSignature() {
+  constexpr int kSignature = 0;
+
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kSignature); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, kSignature + 1);
+      sys_errno(ret) != EPERM) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// The CPU ID fields must hold a non-negative value after registration.
+int TestCPU() {
+  struct rseq r = {};
+  r.cpu_id = kRseqCPUIDUninitialized;
+
+  if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  // Load into signed ints before testing. NOTE(review): if the struct rseq
+  // fields are unsigned (confirm in uapi.h), a direct "< 0" is always false.
+  const int cpu_id = __atomic_load_n(&r.cpu_id, __ATOMIC_RELAXED);
+  const int cpu_id_start = __atomic_load_n(&r.cpu_id_start, __ATOMIC_RELAXED);
+  if (cpu_id < 0 || cpu_id_start < 0) {
+    return 1;
+  }
+  return 0;
+}
+
+// Critical section is eventually aborted.
+int TestAbort() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+  // Loops until abort. If this returns then abort occurred.
+  rseq_loop(&r, &cs);
+
+  return 0;
+};
+
+// Abort may be before the critical section.
+int TestAbortBefore() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_early_abort);
+
+  // Loops until abort. If this returns then abort occurred.
+  rseq_loop(&r, &cs);
+
+  return 0;
+};
+
+// Signature must match.
+int TestAbortSignature() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+  // Loops until abort. This should SIGSEGV on abort.
+  rseq_loop(&r, &cs);
+
+  return 1;
+};
+
+// Abort must not be in the critical section.
+int TestAbortPreCommit() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_pre_commit);
+
+  // Loops until abort. This should SIGSEGV on abort.
+  rseq_loop(&r, &cs);
+
+  return 1;
+};
+
+// rseq.rseq_cs is cleared on abort.
+int TestAbortClearsCS() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+  // Loops until abort. If this returns then abort occurred.
+  rseq_loop(&r, &cs);
+
+  if (__atomic_load_n(&r.rseq_cs, __ATOMIC_RELAXED)) {
+    return 1;
+  }
+
+  return 0;
+};
+
+// rseq.rseq_cs is cleared on abort outside of critical section.
+int TestInvalidAbortClearsCS() {
+  struct rseq r = {};
+  if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+      sys_errno(ret) != 0) {
+    return 1;
+  }
+
+  struct rseq_cs cs = {};
+  cs.version = 0;
+  cs.flags = 0;
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+  __atomic_store_n(&r.rseq_cs, &cs, __ATOMIC_RELAXED);
+
+  // When the next abort condition occurs, the kernel will clear cs once it
+  // determines we aren't in the critical section.
+  while (1) {
+    if (!__atomic_load_n(&r.rseq_cs, __ATOMIC_RELAXED)) {
+      break;
+    }
+  }
+
+  return 0;
+};
+
+// Exit codes:
+//  0 - Pass
+//  1 - Fail
+//  2 - Wrong number of arguments
+//  3 - Unknown test case
+extern "C" int main(int argc, char** argv, char** envp) {
+  if (argc != 2) {
+    // Usage: rseq <test case>
+    return 2;
+  }
+
+  if (strcmp(argv[1], kRseqTestUnaligned) == 0) {
+    return TestUnaligned();
+  }
+  if (strcmp(argv[1], kRseqTestRegister) == 0) {
+    return TestRegister();
+  }
+  if (strcmp(argv[1], kRseqTestDoubleRegister) == 0) {
+    return TestDoubleRegister();
+  }
+  if (strcmp(argv[1], kRseqTestRegisterUnregister) == 0) {
+    return TestRegisterUnregister();
+  }
+  if (strcmp(argv[1], kRseqTestUnregisterDifferentPtr) == 0) {
+    return TestUnregisterDifferentPtr();
+  }
+  if (strcmp(argv[1], kRseqTestUnregisterDifferentSignature) == 0) {
+    return TestUnregisterDifferentSignature();
+  }
+  if (strcmp(argv[1], kRseqTestCPU) == 0) {
+    return TestCPU();
+  }
+  if (strcmp(argv[1], kRseqTestAbort) == 0) {
+    return TestAbort();
+  }
+  if (strcmp(argv[1], kRseqTestAbortBefore) == 0) {
+    return TestAbortBefore();
+  }
+  if (strcmp(argv[1], kRseqTestAbortSignature) == 0) {
+    return TestAbortSignature();
+  }
+  if (strcmp(argv[1], kRseqTestAbortPreCommit) == 0) {
+    return TestAbortPreCommit();
+  }
+  if (strcmp(argv[1], kRseqTestAbortClearsCS) == 0) {
+    return TestAbortClearsCS();
+  }
+  if (strcmp(argv[1], kRseqTestInvalidAbortClearsCS) == 0) {
+    return TestInvalidAbortClearsCS();
+  }
+
+  return 3;
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/rseq/start.S b/test/syscalls/linux/rseq/start.S
new file mode 100644
index 000000000..b9611b276
--- /dev/null
+++ b/test/syscalls/linux/rseq/start.S
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+  .text
+  .align 4
+  .type  _start,@function
+  .globl  _start
+
+_start:
+  movq  %rsp,%rdi
+  call  __init
+  hlt
+
+  .size  _start,.-_start
+  .section  .note.GNU-stack,"",@progbits
+
+  .text
+  .globl  raw_syscall
+  .type   raw_syscall, @function
+
+raw_syscall:
+  mov  %rdi,%rax      // syscall #
+  mov  %rsi,%rdi      // arg0
+  mov  %rdx,%rsi      // arg1
+  mov  %rcx,%rdx      // arg2
+  mov  %r8,%r10       // arg3 (goes in r10 instead of rcx for system calls)
+  mov  %r9,%r8        // arg4
+  mov  0x8(%rsp),%r9  // arg5
+  syscall
+  ret
+
+  .size  raw_syscall,.-raw_syscall
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/syscalls.h b/test/syscalls/linux/rseq/syscalls.h
new file mode 100644
index 000000000..e5299c188
--- /dev/null
+++ b/test/syscalls/linux/rseq/syscalls.h
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
+
+#include "test/syscalls/linux/rseq/types.h"
+
+#ifdef __x86_64__
+// Syscall numbers.
+constexpr int kGetpid = 39;
+constexpr int kExitGroup = 231;
+#else
+#error "Unknown architecture"
+#endif
+
+namespace gvisor {
+namespace testing {
+
+// Standalone system call interfaces.
+// Note that these are all "raw" system call interfaces which encode
+// errors by setting the return value to a small negative number.
+// Use sys_errno() to check system call return values for errors.
+
+// Maximum Linux error number.
+constexpr int kMaxErrno = 4095;
+
+// Errno values.
+#define EPERM 1
+#define EFAULT 14
+#define EBUSY 16
+#define EINVAL 22
+
+// Decodes a raw system call return value: yields the positive error number
+// when rval encodes an error, and 0 when the call succeeded.
+static inline int sys_errno(uintptr_t rval) {
+  // Errors are the last kMaxErrno values of the unsigned range (-4095..-1).
+  const uintptr_t first_error = static_cast<uintptr_t>(-kMaxErrno);
+  const bool failed = rval >= first_error;
+  return failed ? -static_cast<int>(rval) : 0;
+}
+
+extern "C" uintptr_t raw_syscall(int number, ...);
+
+static inline void sys_exit_group(int status) {
+  raw_syscall(kExitGroup, status);
+}
+static inline int sys_getpid() {
+  return static_cast<int>(raw_syscall(kGetpid));
+}
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
diff --git a/test/syscalls/linux/rseq/test.h b/test/syscalls/linux/rseq/test.h
new file mode 100644
index 000000000..3b7bb74b1
--- /dev/null
+++ b/test/syscalls/linux/rseq/test.h
@@ -0,0 +1,43 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
+
+namespace gvisor {
+namespace testing {
+
+// Test cases supported by rseq binary.
+
+inline constexpr char kRseqTestUnaligned[] = "unaligned";
+inline constexpr char kRseqTestRegister[] = "register";
+inline constexpr char kRseqTestDoubleRegister[] = "double-register";
+inline constexpr char kRseqTestRegisterUnregister[] = "register-unregister";
+inline constexpr char kRseqTestUnregisterDifferentPtr[] =
+    "unregister-different-ptr";
+inline constexpr char kRseqTestUnregisterDifferentSignature[] =
+    "unregister-different-signature";
+inline constexpr char kRseqTestCPU[] = "cpu";
+inline constexpr char kRseqTestAbort[] = "abort";
+inline constexpr char kRseqTestAbortBefore[] = "abort-before";
+inline constexpr char kRseqTestAbortSignature[] = "abort-signature";
+inline constexpr char kRseqTestAbortPreCommit[] = "abort-precommit";
+inline constexpr char kRseqTestAbortClearsCS[] = "abort-clears-cs";
+inline constexpr char kRseqTestInvalidAbortClearsCS[] =
+    "invalid-abort-clears-cs";
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
diff --git a/test/syscalls/linux/rseq/types.h b/test/syscalls/linux/rseq/types.h
new file mode 100644
index 000000000..b6afe9817
--- /dev/null
+++ b/test/syscalls/linux/rseq/types.h
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
+
+using size_t = __SIZE_TYPE__;
+using uintptr_t = __UINTPTR_TYPE__;
+
+using uint8_t = __UINT8_TYPE__;
+using uint16_t = __UINT16_TYPE__;
+using uint32_t = __UINT32_TYPE__;
+using uint64_t = __UINT64_TYPE__;
+
+using int8_t = __INT8_TYPE__;
+using int16_t = __INT16_TYPE__;
+using int32_t = __INT32_TYPE__;
+using int64_t = __INT64_TYPE__;
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
diff --git a/test/syscalls/linux/rseq/uapi.h b/test/syscalls/linux/rseq/uapi.h
new file mode 100644
index 000000000..e3ff0579a
--- /dev/null
+++ b/test/syscalls/linux/rseq/uapi.h
@@ -0,0 +1,54 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
+
+// User-kernel ABI for restartable sequences.
+
+// Standard types.
+//
+// N.B. This header will be included in targets that do have the standard
+// library, so we can't shadow the standard type names.
+using __u32 = __UINT32_TYPE__;
+using __u64 = __UINT64_TYPE__;
+
+#ifdef __x86_64__
+// Syscall numbers.
+constexpr int kRseqSyscall = 334;
+#else
+#error "Unknown architecture"
+#endif  // __x86_64__
+
+struct rseq_cs {
+  __u32 version;
+  __u32 flags;
+  __u64 start_ip;
+  __u64 post_commit_offset;
+  __u64 abort_ip;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+// N.B. alignment is enforced by the kernel.
+struct rseq {
+  __u32 cpu_id_start;
+  __u32 cpu_id;
+  struct rseq_cs* rseq_cs;
+  __u32 flags;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+constexpr int kRseqFlagUnregister = 1 << 0;
+
+constexpr int kRseqCPUIDUninitialized = -1;
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
-- 
cgit v1.2.3


From cb5f9b8f863c93bb7e3757c1f4b3e1a64e6acdfb Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Mon, 9 Dec 2019 12:03:16 -0800
Subject: Mark test as non flaky.

PiperOrigin-RevId: 284606133
---
 test/syscalls/linux/BUILD | 2 --
 1 file changed, 2 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index c49445d62..6ea922fb4 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3272,8 +3272,6 @@ cc_binary(
     testonly = 1,
     srcs = ["tcp_socket.cc"],
     linkstatic = 1,
-    # FIXME(b/135470853)
-    tags = ["flaky"],
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
-- 
cgit v1.2.3


From 17867c88f7afdac6ff1c212aeac9aee2045f4f5a Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Mon, 9 Dec 2019 13:35:56 -0800
Subject: Include <netinet/tcp.h> for TCP enums in proc_net tests

These are currently duplicated in ip_socket_test_util, so tests including
both netinet/tcp.h and ip_socket_test_util won't compile.

PiperOrigin-RevId: 284623958
---
 test/syscalls/linux/ip_socket_test_util.h | 19 -------------------
 test/syscalls/linux/proc_net_tcp.cc       |  1 +
 test/syscalls/linux/proc_net_udp.cc       |  1 +
 3 files changed, 2 insertions(+), 19 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 072230d85..9cb4566db 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -26,25 +26,6 @@
 namespace gvisor {
 namespace testing {
 
-// Possible values of the "st" field in a /proc/net/{tcp,udp} entry. Source:
-// Linux kernel, include/net/tcp_states.h.
-enum {
-  TCP_ESTABLISHED = 1,
-  TCP_SYN_SENT,
-  TCP_SYN_RECV,
-  TCP_FIN_WAIT1,
-  TCP_FIN_WAIT2,
-  TCP_TIME_WAIT,
-  TCP_CLOSE,
-  TCP_CLOSE_WAIT,
-  TCP_LAST_ACK,
-  TCP_LISTEN,
-  TCP_CLOSING,
-  TCP_NEW_SYN_RECV,
-
-  TCP_MAX_STATES
-};
-
 // Extracts the IP address from an inet sockaddr in network byte order.
 uint32_t IPFromInetSockaddr(const struct sockaddr* addr);
 
diff --git a/test/syscalls/linux/proc_net_tcp.cc b/test/syscalls/linux/proc_net_tcp.cc
index 2659f6a98..5b6e3e3cd 100644
--- a/test/syscalls/linux/proc_net_tcp.cc
+++ b/test/syscalls/linux/proc_net_tcp.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <netinet/tcp.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/types.h>
diff --git a/test/syscalls/linux/proc_net_udp.cc b/test/syscalls/linux/proc_net_udp.cc
index f06f1a24b..786b4b4af 100644
--- a/test/syscalls/linux/proc_net_udp.cc
+++ b/test/syscalls/linux/proc_net_udp.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <netinet/tcp.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/types.h>
-- 
cgit v1.2.3


From 18af75db9de5244bd3e180a86886a4b3cadd7547 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Mon, 9 Dec 2019 15:51:24 -0800
Subject: Add UDP SO_REUSEADDR support to the port manager.

Next steps include adding support to the transport demuxer and the UDP endpoint.

PiperOrigin-RevId: 284652151
---
 pkg/tcpip/ports/BUILD                              |   2 +-
 pkg/tcpip/ports/ports.go                           | 148 +++++++--
 pkg/tcpip/ports/ports_test.go                      | 182 +++++++----
 pkg/tcpip/transport/tcp/BUILD                      |   1 +
 pkg/tcpip/transport/tcp/endpoint.go                |  25 +-
 pkg/tcpip/transport/udp/BUILD                      |   1 +
 pkg/tcpip/transport/udp/endpoint.go                |  19 +-
 .../linux/socket_bind_to_device_sequence.cc        | 353 +++++++++++++++------
 8 files changed, 536 insertions(+), 195 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD
index 4839f0a65..e156b01f6 100644
--- a/pkg/tcpip/ports/BUILD
+++ b/pkg/tcpip/ports/BUILD
@@ -1,5 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go
index 30cea8996..6c5e19e8f 100644
--- a/pkg/tcpip/ports/ports.go
+++ b/pkg/tcpip/ports/ports.go
@@ -41,6 +41,30 @@ type portDescriptor struct {
 	port      uint16
 }
 
+// Flags represents the type of port reservation.
+//
+// +stateify savable
+type Flags struct {
+	// MostRecent represents UDP SO_REUSEADDR.
+	MostRecent bool
+
+	// LoadBalanced indicates SO_REUSEPORT.
+	//
+	// LoadBalanced takes precedence over MostRecent.
+	LoadBalanced bool
+}
+
+func (f Flags) bits() reuseFlag {
+	var rf reuseFlag
+	if f.MostRecent {
+		rf |= mostRecentFlag
+	}
+	if f.LoadBalanced {
+		rf |= loadBalancedFlag
+	}
+	return rf
+}
+
 // PortManager manages allocating, reserving and releasing ports.
 type PortManager struct {
 	mu             sync.RWMutex
@@ -54,9 +78,59 @@ type PortManager struct {
 	hint uint32
 }
 
+type reuseFlag int
+
+const (
+	mostRecentFlag reuseFlag = 1 << iota
+	loadBalancedFlag
+	nextFlag
+
+	flagMask = nextFlag - 1
+)
+
 type portNode struct {
-	reuse bool
-	refs  int
+	// refs stores the count for each possible flag combination.
+	refs [nextFlag]int
+}
+
+func (p portNode) totalRefs() int {
+	var total int
+	for _, r := range p.refs {
+		total += r
+	}
+	return total
+}
+
+// flagRefs returns the number of references with all specified flags.
+func (p portNode) flagRefs(flags reuseFlag) int {
+	var total int
+	for i, r := range p.refs {
+		if reuseFlag(i)&flags == flags {
+			total += r
+		}
+	}
+	return total
+}
+
+// allRefsHave reports whether every held reference has all specified flags.
+func (p portNode) allRefsHave(flags reuseFlag) bool {
+	for i, r := range p.refs {
+		if reuseFlag(i)&flags != flags && r > 0 { // a live ref lacks a flag
+			return false
+		}
+	}
+	return true
+}
+
+// intersectionRefs returns the set of flags shared by all references.
+func (p portNode) intersectionRefs() reuseFlag {
+	intersection := flagMask
+	for i, r := range p.refs {
+		if r > 0 {
+			intersection &= reuseFlag(i)
+		}
+	}
+	return intersection
 }
 
 // deviceNode is never empty. When it has no elements, it is removed from the
@@ -66,30 +140,44 @@ type deviceNode map[tcpip.NICID]portNode
 // isAvailable checks whether binding is possible by device. If not binding to a
 // device, check against all portNodes. If binding to a specific device, check
 // against the unspecified device and the provided device.
-func (d deviceNode) isAvailable(reuse bool, bindToDevice tcpip.NICID) bool {
+//
+// If either of the port reuse flags is enabled on any of the nodes, all nodes
+// sharing a port must share at least one reuse flag. This matches Linux's
+// behavior.
+func (d deviceNode) isAvailable(flags Flags, bindToDevice tcpip.NICID) bool {
+	flagBits := flags.bits()
 	if bindToDevice == 0 {
 		// Trying to binding all devices.
-		if !reuse {
+		if flagBits == 0 {
 			// Can't bind because the (addr,port) is already bound.
 			return false
 		}
+		intersection := flagMask
 		for _, p := range d {
-			if !p.reuse {
-				// Can't bind because the (addr,port) was previously bound without reuse.
+			i := p.intersectionRefs()
+			intersection &= i
+			if intersection&flagBits == 0 {
+				// Can't bind because the (addr,port) was
+				// previously bound without reuse.
 				return false
 			}
 		}
 		return true
 	}
 
+	intersection := flagMask
+
 	if p, ok := d[0]; ok {
-		if !reuse || !p.reuse {
+		intersection = p.intersectionRefs()
+		if intersection&flagBits == 0 {
 			return false
 		}
 	}
 
 	if p, ok := d[bindToDevice]; ok {
-		if !reuse || !p.reuse {
+		i := p.intersectionRefs()
+		intersection &= i
+		if intersection&flagBits == 0 {
 			return false
 		}
 	}
@@ -103,12 +191,12 @@ type bindAddresses map[tcpip.Address]deviceNode
 // isAvailable checks whether an IP address is available to bind to. If the
 // address is the "any" address, check all other addresses. Otherwise, just
 // check against the "any" address and the provided address.
-func (b bindAddresses) isAvailable(addr tcpip.Address, reuse bool, bindToDevice tcpip.NICID) bool {
+func (b bindAddresses) isAvailable(addr tcpip.Address, flags Flags, bindToDevice tcpip.NICID) bool {
 	if addr == anyIPAddress {
 		// If binding to the "any" address then check that there are no conflicts
 		// with all addresses.
 		for _, d := range b {
-			if !d.isAvailable(reuse, bindToDevice) {
+			if !d.isAvailable(flags, bindToDevice) {
 				return false
 			}
 		}
@@ -117,14 +205,14 @@ func (b bindAddresses) isAvailable(addr tcpip.Address, reuse bool, bindToDevice
 
 	// Check that there is no conflict with the "any" address.
 	if d, ok := b[anyIPAddress]; ok {
-		if !d.isAvailable(reuse, bindToDevice) {
+		if !d.isAvailable(flags, bindToDevice) {
 			return false
 		}
 	}
 
 	// Check that this is no conflict with the provided address.
 	if d, ok := b[addr]; ok {
-		if !d.isAvailable(reuse, bindToDevice) {
+		if !d.isAvailable(flags, bindToDevice) {
 			return false
 		}
 	}
@@ -190,17 +278,17 @@ func (s *PortManager) pickEphemeralPort(offset, count uint32, testPort func(p ui
 }
 
 // IsPortAvailable tests if the given port is available on all given protocols.
-func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
+func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool {
 	s.mu.Lock()
 	defer s.mu.Unlock()
-	return s.isPortAvailableLocked(networks, transport, addr, port, reuse, bindToDevice)
+	return s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice)
 }
 
-func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
+func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool {
 	for _, network := range networks {
 		desc := portDescriptor{network, transport, port}
 		if addrs, ok := s.allocatedPorts[desc]; ok {
-			if !addrs.isAvailable(addr, reuse, bindToDevice) {
+			if !addrs.isAvailable(addr, flags, bindToDevice) {
 				return false
 			}
 		}
@@ -212,14 +300,14 @@ func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumb
 // reserved by another endpoint. If port is zero, ReservePort will search for
 // an unreserved ephemeral port and reserve it, returning its value in the
 // "port" return value.
-func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) (reservedPort uint16, err *tcpip.Error) {
+func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) (reservedPort uint16, err *tcpip.Error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
 	// If a port is specified, just try to reserve it for all network
 	// protocols.
 	if port != 0 {
-		if !s.reserveSpecificPort(networks, transport, addr, port, reuse, bindToDevice) {
+		if !s.reserveSpecificPort(networks, transport, addr, port, flags, bindToDevice) {
 			return 0, tcpip.ErrPortInUse
 		}
 		return port, nil
@@ -227,15 +315,16 @@ func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transp
 
 	// A port wasn't specified, so try to find one.
 	return s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) {
-		return s.reserveSpecificPort(networks, transport, addr, p, reuse, bindToDevice), nil
+		return s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice), nil
 	})
 }
 
 // reserveSpecificPort tries to reserve the given port on all given protocols.
-func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
-	if !s.isPortAvailableLocked(networks, transport, addr, port, reuse, bindToDevice) {
+func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool {
+	if !s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice) {
 		return false
 	}
+	flagBits := flags.bits()
 
 	// Reserve port on all network protocols.
 	for _, network := range networks {
@@ -250,12 +339,9 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber
 			d = make(deviceNode)
 			m[addr] = d
 		}
-		if n, ok := d[bindToDevice]; ok {
-			n.refs++
-			d[bindToDevice] = n
-		} else {
-			d[bindToDevice] = portNode{reuse: reuse, refs: 1}
-		}
+		n := d[bindToDevice]
+		n.refs[flagBits]++
+		d[bindToDevice] = n
 	}
 
 	return true
@@ -263,10 +349,12 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber
 
 // ReleasePort releases the reservation on a port/IP combination so that it can
 // be reserved by other endpoints.
-func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, bindToDevice tcpip.NICID) {
+func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
+	flagBits := flags.bits()
+
 	for _, network := range networks {
 		desc := portDescriptor{network, transport, port}
 		if m, ok := s.allocatedPorts[desc]; ok {
@@ -278,9 +366,9 @@ func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transp
 			if !ok {
 				continue
 			}
-			n.refs--
+			n.refs[flagBits]--
 			d[bindToDevice] = n
-			if n.refs == 0 {
+			if n.refs == [nextFlag]int{} {
 				delete(d, bindToDevice)
 			}
 			if len(d) == 0 {
diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go
index 19f4833fc..d6969d050 100644
--- a/pkg/tcpip/ports/ports_test.go
+++ b/pkg/tcpip/ports/ports_test.go
@@ -33,7 +33,7 @@ type portReserveTestAction struct {
 	port    uint16
 	ip      tcpip.Address
 	want    *tcpip.Error
-	reuse   bool
+	flags   Flags
 	release bool
 	device  tcpip.NICID
 }
@@ -50,7 +50,7 @@ func TestPortReservation(t *testing.T) {
 				{port: 80, ip: fakeIPAddress1, want: nil},
 				/* N.B. Order of tests matters! */
 				{port: 80, ip: anyIPAddress, want: tcpip.ErrPortInUse},
-				{port: 80, ip: fakeIPAddress, want: tcpip.ErrPortInUse, reuse: true},
+				{port: 80, ip: fakeIPAddress, want: tcpip.ErrPortInUse, flags: Flags{LoadBalanced: true}},
 			},
 		},
 		{
@@ -61,7 +61,7 @@ func TestPortReservation(t *testing.T) {
 				/* release fakeIPAddress, but anyIPAddress is still inuse */
 				{port: 22, ip: fakeIPAddress, release: true},
 				{port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
-				{port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse, reuse: true},
+				{port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse, flags: Flags{LoadBalanced: true}},
 				/* Release port 22 from any IP address, then try to reserve fake IP address on 22 */
 				{port: 22, ip: anyIPAddress, want: nil, release: true},
 				{port: 22, ip: fakeIPAddress, want: nil},
@@ -71,36 +71,36 @@ func TestPortReservation(t *testing.T) {
 			actions: []portReserveTestAction{
 				{port: 00, ip: fakeIPAddress, want: nil},
 				{port: 00, ip: fakeIPAddress, want: nil},
-				{port: 00, ip: fakeIPAddress, reuse: true, want: nil},
+				{port: 00, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 			},
 		}, {
 			tname: "bind to ip with reuseport",
 			actions: []portReserveTestAction{
-				{port: 25, ip: fakeIPAddress, reuse: true, want: nil},
-				{port: 25, ip: fakeIPAddress, reuse: true, want: nil},
+				{port: 25, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 25, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 
-				{port: 25, ip: fakeIPAddress, reuse: false, want: tcpip.ErrPortInUse},
-				{port: 25, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse},
+				{port: 25, ip: fakeIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
+				{port: 25, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
 
-				{port: 25, ip: anyIPAddress, reuse: true, want: nil},
+				{port: 25, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 			},
 		}, {
 			tname: "bind to inaddr any with reuseport",
 			actions: []portReserveTestAction{
-				{port: 24, ip: anyIPAddress, reuse: true, want: nil},
-				{port: 24, ip: anyIPAddress, reuse: true, want: nil},
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 
-				{port: 24, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, reuse: false, want: tcpip.ErrPortInUse},
+				{port: 24, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
 
-				{port: 24, ip: fakeIPAddress, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, release: true, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, release: true, want: nil},
 
-				{port: 24, ip: anyIPAddress, release: true},
-				{port: 24, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse},
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, release: true},
+				{port: 24, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
 
-				{port: 24, ip: anyIPAddress, release: true},
-				{port: 24, ip: anyIPAddress, reuse: false, want: nil},
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, release: true},
+				{port: 24, ip: anyIPAddress, flags: Flags{}, want: nil},
 			},
 		}, {
 			tname: "bind twice with device fails",
@@ -125,88 +125,152 @@ func TestPortReservation(t *testing.T) {
 			actions: []portReserveTestAction{
 				{port: 24, ip: fakeIPAddress, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 				{port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 			},
 		}, {
 			tname: "bind with device",
 			actions: []portReserveTestAction{
 				{port: 24, ip: fakeIPAddress, device: 123, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 456, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 789, want: nil},
 				{port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 			},
 		}, {
-			tname: "bind with reuse",
+			tname: "bind with reuseport",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
 			},
 		}, {
-			tname: "binding with reuse and device",
+			tname: "binding with reuseport and device",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 456, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 999, want: tcpip.ErrPortInUse},
 			},
 		}, {
-			tname: "mixing reuse and not reuse by binding to device",
+			tname: "mixing reuseport and not reuseport by binding to device",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 456, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 999, want: nil},
 			},
 		}, {
-			tname: "can't bind to 0 after mixing reuse and not reuse",
+			tname: "can't bind to 0 after mixing reuseport and not reuseport",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
 				{port: 24, ip: fakeIPAddress, device: 456, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 			},
 		}, {
 			tname: "bind and release",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: tcpip.ErrPortInUse},
-				{port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
 
 				// Release the bind to device 0 and try again.
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil, release: true},
-				{port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: nil},
 			},
 		}, {
-			tname: "bind twice with reuse once",
+			tname: "bind twice with reuseport once",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
 			},
 		}, {
 			tname: "release an unreserved device",
 			actions: []portReserveTestAction{
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil},
-				{port: 24, ip: fakeIPAddress, device: 456, reuse: false, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{}, want: nil},
 				// The below don't exist.
-				{port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: nil, release: true},
-				{port: 9999, ip: fakeIPAddress, device: 123, reuse: false, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: nil, release: true},
+				{port: 9999, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil, release: true},
 				// Release all.
-				{port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil, release: true},
-				{port: 24, ip: fakeIPAddress, device: 456, reuse: false, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{}, want: nil, release: true},
+			},
+		}, {
+			tname: "bind with reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{MostRecent: true}, want: nil},
+			},
+		}, {
+			tname: "bind twice with reuseaddr once",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport, and then reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport, and then reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport twice, and then reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport twice, and then reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+			},
+		}, {
+			tname: "bind with reuseaddr, and then reuseaddr and reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseport, and then reuseaddr and reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
 			},
 		},
 	} {
@@ -216,12 +280,12 @@ func TestPortReservation(t *testing.T) {
 
 			for _, test := range test.actions {
 				if test.release {
-					pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.device)
+					pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device)
 					continue
 				}
-				gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.reuse, test.device)
+				gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device)
 				if err != test.want {
-					t.Fatalf("ReservePort(.., .., %s, %d, %t, %d) = %v, want %v", test.ip, test.port, test.reuse, test.device, err, test.want)
+					t.Fatalf("ReservePort(.., .., %s, %d, %+v, %d) = %v, want %v", test.ip, test.port, test.flags, test.device, err, test.want)
 				}
 				if test.port == 0 && (gotPort == 0 || gotPort < FirstEphemeral) {
 					t.Fatalf("ReservePort(.., .., .., 0) = %d, want port number >= %d to be picked", gotPort, FirstEphemeral)
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index dd1728f9c..455a1c098 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -52,6 +52,7 @@ go_library(
         "//pkg/tcpip/hash/jenkins",
         "//pkg/tcpip/header",
         "//pkg/tcpip/iptables",
+        "//pkg/tcpip/ports",
         "//pkg/tcpip/seqnum",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 9d4a87e30..4861ab513 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -30,6 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tmutex"
@@ -343,6 +344,7 @@ type endpoint struct {
 	// Values used to reserve a port or register a transport endpoint
 	// (which ever happens first).
 	boundBindToDevice tcpip.NICID
+	boundPortFlags    ports.Flags
 
 	// effectiveNetProtos contains the network protocols actually in use. In
 	// most cases it will only contain "netProto", but in cases like IPv6
@@ -737,9 +739,10 @@ func (e *endpoint) Close() {
 			e.isRegistered = false
 		}
 
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
 		e.isPortReserved = false
 		e.boundBindToDevice = 0
+		e.boundPortFlags = ports.Flags{}
 	}
 
 	// Mark endpoint as closed.
@@ -800,10 +803,11 @@ func (e *endpoint) cleanupLocked() {
 	}
 
 	if e.isPortReserved {
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
 		e.isPortReserved = false
 	}
 	e.boundBindToDevice = 0
+	e.boundPortFlags = ports.Flags{}
 
 	e.route.Release()
 	e.stack.CompleteTransportEndpointCleanup(e)
@@ -1775,7 +1779,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 			}
 			// reusePort is false below because connect cannot reuse a port even if
 			// reusePort was set.
-			if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, false /* reusePort */, e.bindToDevice) {
+			if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, ports.Flags{LoadBalanced: false}, e.bindToDevice) {
 				return false, nil
 			}
 
@@ -1802,7 +1806,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	// before Connect: in such a case we don't want to hold on to
 	// reservations anymore.
 	if e.isPortReserved {
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
 		e.isPortReserved = false
 	}
 
@@ -2034,28 +2038,33 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
 		}
 	}
 
-	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort, e.bindToDevice)
+	flags := ports.Flags{
+		LoadBalanced: e.reusePort,
+	}
+	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, flags, e.bindToDevice)
 	if err != nil {
 		return err
 	}
 
 	e.boundBindToDevice = e.bindToDevice
+	e.boundPortFlags = flags
 	e.isPortReserved = true
 	e.effectiveNetProtos = netProtos
 	e.ID.LocalPort = port
 
 	// Any failures beyond this point must remove the port registration.
-	defer func(bindToDevice tcpip.NICID) {
+	defer func(portFlags ports.Flags, bindToDevice tcpip.NICID) {
 		if err != nil {
-			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, bindToDevice)
+			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, portFlags, bindToDevice)
 			e.isPortReserved = false
 			e.effectiveNetProtos = nil
 			e.ID.LocalPort = 0
 			e.ID.LocalAddress = ""
 			e.boundNICID = 0
 			e.boundBindToDevice = 0
+			e.boundPortFlags = ports.Flags{}
 		}
-	}(e.boundBindToDevice)
+	}(e.boundPortFlags, e.boundBindToDevice)
 
 	// If an address is specified, we must ensure that it's one of our
 	// local addresses.
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index 8d4c3808f..97e4d5825 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -34,6 +34,7 @@ go_library(
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
         "//pkg/tcpip/iptables",
+        "//pkg/tcpip/ports",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
         "//pkg/waiter",
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 4b161e404..1ac4705af 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -107,6 +108,7 @@ type endpoint struct {
 	// Values used to reserve a port or register a transport endpoint.
 	// (which ever happens first).
 	boundBindToDevice tcpip.NICID
+	boundPortFlags    ports.Flags
 
 	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
 	// applied while sending packets. Defaults to 0 as on Linux.
@@ -180,8 +182,9 @@ func (e *endpoint) Close() {
 	switch e.state {
 	case StateBound, StateConnected:
 		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
-		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
 		e.boundBindToDevice = 0
+		e.boundPortFlags = ports.Flags{}
 	}
 
 	for _, mem := range e.multicastMemberships {
@@ -895,7 +898,8 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 	} else {
 		if e.ID.LocalPort != 0 {
 			// Release the ephemeral port.
-			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
+			e.boundPortFlags = ports.Flags{}
 		}
 		e.state = StateInitial
 	}
@@ -1042,16 +1046,23 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 
 func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) {
 	if e.ID.LocalPort == 0 {
-		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.reusePort, e.bindToDevice)
+		flags := ports.Flags{
+			LoadBalanced: e.reusePort,
+			// FIXME(b/129164367): Support SO_REUSEADDR.
+			MostRecent: false,
+		}
+		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, flags, e.bindToDevice)
 		if err != nil {
 			return id, e.bindToDevice, err
 		}
+		e.boundPortFlags = flags
 		id.LocalPort = port
 	}
 
 	err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice)
 	if err != nil {
-		e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.bindToDevice)
+		e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.boundPortFlags, e.bindToDevice)
+		e.boundPortFlags = ports.Flags{}
 	}
 	return id, e.bindToDevice, err
 }
diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
index e4641c62e..033fd80a5 100644
--- a/test/syscalls/linux/socket_bind_to_device_sequence.cc
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -97,12 +97,12 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
     sockets_to_close_.erase(socket_id);
   }
 
-  // Bind a socket with the reuse option and bind_to_device options.  Checks
+  // Bind a socket with the reuse options and bind_to_device options. Checks
   // that all steps succeed and that the bind command's error matches want.
   // Sets the socket_id to uniquely identify the socket bound if it is not
   // nullptr.
-  void BindSocket(bool reuse, int device_id = 0, int want = 0,
-                  int *socket_id = nullptr) {
+  void BindSocket(bool reuse_port, bool reuse_addr, int device_id = 0,
+                  int want = 0, int *socket_id = nullptr) {
     next_socket_id_++;
     sockets_to_close_[next_socket_id_] = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
     auto socket_fd = sockets_to_close_[next_socket_id_]->get();
@@ -110,13 +110,20 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
       *socket_id = next_socket_id_;
     }
 
-    // If reuse is indicated, do that.
-    if (reuse) {
+    // If reuse_port is indicated, do that.
+    if (reuse_port) {
       EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
                              sizeof(kSockOptOn)),
                   SyscallSucceedsWithValue(0));
     }
 
+    // If reuse_addr is indicated, do that.
+    if (reuse_addr) {
+      EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                             sizeof(kSockOptOn)),
+                  SyscallSucceedsWithValue(0));
+    }
+
     // If the device is non-zero, bind to that device.
     if (device_id != 0) {
       string device_name;
@@ -182,129 +189,289 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
 };
 
 TEST_P(BindToDeviceSequenceTest, BindTwiceWithDeviceFails) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 3));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 3, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 3));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 3, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindToDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 1));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 2));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 1));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 2));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindToDeviceAndThenWithoutDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindWithoutDevice) {
-  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ false));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindWithDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 456, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 789, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 456, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 789, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindWithReuse) {
-  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
   ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true, /* bind_to_device */ 0));
+      BindSocket(/* reuse_port */ true, /* reuse_addr */ false));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false,
+      /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 0));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindingWithReuseAndDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 456));
-  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 789));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 999, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 456));
+  ASSERT_NO_FATAL_FAILURE(
+      BindSocket(/* reuse_port */ true, /* reuse_addr */ false));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 789));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 999, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, MixingReuseAndNotReuseByBindingToDevice) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 456, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 789, 0));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 999, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 456, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 789, 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 999, 0));
 }
 
 TEST_P(BindToDeviceSequenceTest, CannotBindTo0AfterMixingReuseAndNotReuse) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 456));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 456));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindAndRelease) {
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
   int to_release;
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, 0, &to_release));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 345, EADDRINUSE));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 789));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, 0, &to_release));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 345, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 789));
   // Release the bind to device 0 and try again.
   ASSERT_NO_FATAL_FAILURE(ReleaseSocket(to_release));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 345));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 345));
 }
 
 TEST_P(BindToDeviceSequenceTest, BindTwiceWithReuseOnce) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindWithReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
   ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ false, /* bind_to_device */ 123));
-  ASSERT_NO_FATAL_FAILURE(
-      BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+      BindSocket(/* reuse_port */ false, /* reuse_addr */ true));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 123, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+}
+
+TEST_P(BindToDeviceSequenceTest,
+       CannotBindTo0AfterMixingReuseAddrAndNotReuseAddr) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 456));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReusePort) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReusePort) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ true, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ true, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReusePortThenReuseAddrReusePort) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrThenReuseAddr) {
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0, EADDRINUSE));
+}
+
+// The final reuse_port-only bind succeeding here seems like a bug?
+TEST_P(BindToDeviceSequenceTest,
+       BindReuseAddrThenReuseAddrReusePortThenReuseAddr) {
+  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  SKIP_IF(IsRunningOnGvisor());
+
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ true,
+                                     /* bind_to_device */ 0));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 0));
 }
 
 INSTANTIATE_TEST_SUITE_P(BindToDeviceTest, BindToDeviceSequenceTest,
-- 
cgit v1.2.3


From 98aafb1334b816596b462ad12fa3e96784703061 Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Mon, 9 Dec 2019 20:07:14 -0800
Subject: Add test for SO_BINDTODEVICE state bug.

This was accidentally dropped from the change which fixed the bug.

Updates #1217

PiperOrigin-RevId: 284689362
---
 .../linux/socket_bind_to_device_sequence.cc        | 29 ++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
index 033fd80a5..34b1058a9 100644
--- a/test/syscalls/linux/socket_bind_to_device_sequence.cc
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -97,6 +97,16 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
     sockets_to_close_.erase(socket_id);
   }
 
+  // SetDevice changes the bind_to_device option. It does not bind or re-bind.
+  void SetDevice(int socket_id, int device_id) {
+    auto socket_fd = sockets_to_close_[socket_id]->get();
+    string device_name;
+    ASSERT_NO_FATAL_FAILURE(GetDevice(device_id, &device_name));
+    EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE,
+                           device_name.c_str(), device_name.size() + 1),
+                SyscallSucceedsWithValue(0));
+  }
+
   // Bind a socket with the reuse options and bind_to_device options. Checks
   // that all steps succeed and that the bind command's error matches want.
   // Sets the socket_id to uniquely identify the socket bound if it is not
@@ -474,6 +484,25 @@ TEST_P(BindToDeviceSequenceTest,
                                      /* bind_to_device */ 0));
 }
 
+// Repro test for gvisor.dev/issue/1217. Not replicated in ports_test.go as this
+// test is different from the others and wouldn't fit well there.
+TEST_P(BindToDeviceSequenceTest, BindAndReleaseDifferentDevice) {
+  int to_release;
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 3, 0, &to_release));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+                                     /* reuse_addr */ false,
+                                     /* bind_to_device */ 3, EADDRINUSE));
+  // Change the device. Since the socket was already bound, this should have no
+  // effect. Stop here if SetDevice hits a fatal failure in its own lookup.
+  ASSERT_NO_FATAL_FAILURE(SetDevice(to_release, 2));
+  // Release the bind to device 3 and try again.
+  ASSERT_NO_FATAL_FAILURE(ReleaseSocket(to_release));
+  ASSERT_NO_FATAL_FAILURE(BindSocket(
+      /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 3));
+}
+
 INSTANTIATE_TEST_SUITE_P(BindToDeviceTest, BindToDeviceSequenceTest,
                          ::testing::Values(IPv4UDPUnboundSocket(0),
                                            IPv4TCPUnboundSocket(0)));
-- 
cgit v1.2.3


From 4a19ebd431659578c9af0a91ff35d8b6d9de190e Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 10 Dec 2019 09:32:47 -0800
Subject: Add hostinet tests for sendmsg and recvmsg with TOS/TCLASS.

PiperOrigin-RevId: 284786069
---
 test/syscalls/linux/udp_socket_test_cases.cc | 149 +++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 4556f16d6..dc35c2f50 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1345,5 +1345,154 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
   ASSERT_EQ(tv.tv_usec, tv2.tv_usec);
 }
 
+// Test that a socket with IP_TOS or IPV6_TCLASS set will set the TOS byte on
+// outgoing packets, and that a receiving socket with IP_RECVTOS or
+// IPV6_RECVTCLASS will create the corresponding control message.
+TEST_P(UdpSocketTest, SetAndReceiveTOS) {
+  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
+  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Allow socket to receive control message.
+  int recv_level = SOL_IP;
+  int recv_type = IP_RECVTOS;
+  if (GetParam() != AddressFamily::kIpv4) {
+    recv_level = SOL_IPV6;
+    recv_type = IPV6_RECVTCLASS;
+  }
+  ASSERT_THAT(
+      setsockopt(s_, recv_level, recv_type, &kSockOptOn, sizeof(kSockOptOn)),
+      SyscallSucceeds());
+
+  // Set socket TOS.
+  int sent_level = recv_level;
+  int sent_type = IP_TOS;
+  if (sent_level == SOL_IPV6) {
+    sent_type = IPV6_TCLASS;
+  }
+  int sent_tos = IPTOS_LOWDELAY;  // Choose some TOS value.
+  ASSERT_THAT(
+      setsockopt(t_, sent_level, sent_type, &sent_tos, sizeof(sent_tos)),
+      SyscallSucceeds());
+
+  // Prepare message to send.
+  constexpr size_t kDataLength = 1024;
+  struct msghdr sent_msg = {};
+  struct iovec sent_iov = {};
+  char sent_data[kDataLength];
+  sent_iov.iov_base = &sent_data[0];
+  sent_iov.iov_len = kDataLength;
+  sent_msg.msg_iov = &sent_iov;
+  sent_msg.msg_iovlen = 1;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(t_, &sent_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  // Receive message.
+  struct msghdr received_msg = {};
+  struct iovec received_iov = {};
+  char received_data[kDataLength];
+  received_iov.iov_base = &received_data[0];
+  received_iov.iov_len = kDataLength;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+  size_t cmsg_data_len = sizeof(int8_t);
+  if (sent_type == IPV6_TCLASS) {
+    cmsg_data_len = sizeof(int);
+  }
+  std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+  received_msg.msg_control = &received_cmsgbuf[0];
+  received_msg.msg_controllen = received_cmsgbuf.size();
+  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &received_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+  ASSERT_NE(cmsg, nullptr);
+  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+  EXPECT_EQ(cmsg->cmsg_level, sent_level);
+  EXPECT_EQ(cmsg->cmsg_type, sent_type);
+  int8_t received_tos = 0;
+  memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
+  EXPECT_EQ(received_tos, sent_tos);
+}
+
+// Test that sendmsg with IP_TOS and IPV6_TCLASS control messages will set the
+// TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
+// IPV6_RECVTCLASS will create the corresponding control message.
+TEST_P(UdpSocketTest, SendAndReceiveTOS) {
+  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
+  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+  ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+  ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Allow socket to receive control message.
+  int recv_level = SOL_IP;
+  int recv_type = IP_RECVTOS;
+  if (GetParam() != AddressFamily::kIpv4) {
+    recv_level = SOL_IPV6;
+    recv_type = IPV6_RECVTCLASS;
+  }
+  int recv_opt = kSockOptOn;
+  ASSERT_THAT(
+      setsockopt(s_, recv_level, recv_type, &recv_opt, sizeof(recv_opt)),
+      SyscallSucceeds());
+
+  // Prepare message to send.
+  constexpr size_t kDataLength = 1024;
+  int sent_level = recv_level;
+  int sent_type = IP_TOS;
+  int sent_tos = IPTOS_LOWDELAY;  // Choose some TOS value.
+
+  struct msghdr sent_msg = {};
+  struct iovec sent_iov = {};
+  char sent_data[kDataLength];
+  sent_iov.iov_base = &sent_data[0];
+  sent_iov.iov_len = kDataLength;
+  sent_msg.msg_iov = &sent_iov;
+  sent_msg.msg_iovlen = 1;
+  size_t cmsg_data_len = sizeof(int8_t);
+  if (sent_level == SOL_IPV6) {
+    sent_type = IPV6_TCLASS;
+    cmsg_data_len = sizeof(int);
+  }
+  std::vector<char> sent_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+  sent_msg.msg_control = &sent_cmsgbuf[0];
+  sent_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+
+  // Manually add control message.
+  struct cmsghdr* sent_cmsg = CMSG_FIRSTHDR(&sent_msg);
+  sent_cmsg->cmsg_len = CMSG_LEN(cmsg_data_len);
+  sent_cmsg->cmsg_level = sent_level;
+  sent_cmsg->cmsg_type = sent_type;
+  *(int8_t*)CMSG_DATA(sent_cmsg) = sent_tos;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(t_, &sent_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  // Receive message.
+  struct msghdr received_msg = {};
+  struct iovec received_iov = {};
+  char received_data[kDataLength];
+  received_iov.iov_base = &received_data[0];
+  received_iov.iov_len = kDataLength;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+  std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+  received_msg.msg_control = &received_cmsgbuf[0];
+  received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+  ASSERT_THAT(RetryEINTR(recvmsg)(s_, &received_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+  ASSERT_NE(cmsg, nullptr);
+  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+  EXPECT_EQ(cmsg->cmsg_level, sent_level);
+  EXPECT_EQ(cmsg->cmsg_type, sent_type);
+  int8_t received_tos = 0;
+  memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
+  EXPECT_EQ(received_tos, sent_tos);
+}
+
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From aadbf322c63b0aa1d34cd9755dc1266af2e5ac58 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 10 Dec 2019 09:36:52 -0800
Subject: Disable execveat test that is causing files in /bin to be deleted.

Disable until gvisor.dev/issue/1366 is resolved.

Updates #1366

PiperOrigin-RevId: 284786895
---
 test/syscalls/linux/exec.cc | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index b5e0a512b..e402d5b27 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -696,15 +696,6 @@ TEST(ExecveatTest, SymlinkNoFollowAndEmptyPath) {
                 ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n"));
 }
 
-TEST(ExecveatTest, SymlinkNoFollowIgnoreSymlinkAncestor) {
-  TempPath parent_link =
-      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateSymlinkTo("/tmp", "/bin"));
-  std::string path_with_symlink = JoinPath(parent_link.path(), "echo");
-
-  CheckExecveat(AT_FDCWD, path_with_symlink, {path_with_symlink}, {},
-                AT_SYMLINK_NOFOLLOW, ArgEnvExitStatus(0, 0), "");
-}
-
 TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) {
   const FileDescriptor dirfd =
       ASSERT_NO_ERRNO_AND_VALUE(Open("/bin", O_DIRECTORY));
-- 
cgit v1.2.3


From 769e1cdcbe539ca2347ad5ccd2706ae17777aed9 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 10 Dec 2019 11:40:29 -0800
Subject: Re-enable execveat test that was causing files in /bin to be deleted.

Test now no longer deletes files incorrectly, due to a fix in fs utils
used by TempPath (github.com/google/gvisor/pull/1368).

Fixes #1366

PiperOrigin-RevId: 284814605
---
 test/syscalls/linux/exec.cc | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index e402d5b27..b5e0a512b 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -696,6 +696,15 @@ TEST(ExecveatTest, SymlinkNoFollowAndEmptyPath) {
                 ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n"));
 }
 
+TEST(ExecveatTest, SymlinkNoFollowIgnoreSymlinkAncestor) {
+  TempPath parent_link =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateSymlinkTo("/tmp", "/bin"));
+  std::string path_with_symlink = JoinPath(parent_link.path(), "echo");
+
+  CheckExecveat(AT_FDCWD, path_with_symlink, {path_with_symlink}, {},
+                AT_SYMLINK_NOFOLLOW, ArgEnvExitStatus(0, 0), "");
+}
+
 TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) {
   const FileDescriptor dirfd =
       ASSERT_NO_ERRNO_AND_VALUE(Open("/bin", O_DIRECTORY));
-- 
cgit v1.2.3


From 1601e78a52e9181d1ea8a3ff36399575e95ad0bf Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 11 Dec 2019 16:39:58 -0800
Subject: Add syscall tests for getxattr and setxattr.

Support for getxattr and setxattr is added in subsequent commits.

PiperOrigin-RevId: 285088817
---
 test/syscalls/BUILD          |   5 +
 test/syscalls/linux/BUILD    |  21 ++
 test/syscalls/linux/xattr.cc | 491 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 517 insertions(+)
 create mode 100644 test/syscalls/linux/xattr.cc

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 829693e8e..a3a85917d 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -717,6 +717,11 @@ syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test")
 
 syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
 
+syscall_test(
+    add_overlay = True,
+    test = "//test/syscalls/linux:xattr_test",
+)
+
 go_binary(
     name = "syscall_test_runner",
     testonly = 1,
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 6ea922fb4..0bbaaf28a 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3722,3 +3722,24 @@ cc_binary(
         "@com_google_googletest//:gtest",
     ],
 )
+
+cc_binary(
+    name = "xattr_test",
+    testonly = 1,
+    srcs = [
+        "file_base.h",
+        "xattr.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:posix_error",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
new file mode 100644
index 000000000..3e07b634b
--- /dev/null
+++ b/test/syscalls/linux/xattr.cc
@@ -0,0 +1,491 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/util/capability_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class XattrTest : public FileTest {};
+
+TEST_F(XattrTest, XattrNullName) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+
+  EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
+              SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, XattrEmptyName) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+
+  EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(getxattr(path, "", nullptr, 0), SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, XattrLargeName) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  std::string name = "user.";
+  name += std::string(XATTR_NAME_MAX - name.length(), 'a');
+  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+              SyscallSucceedsWithValue(0));
+
+  name += "a";
+  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+              SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, XattrInvalidPrefix) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  std::string name(XATTR_NAME_MAX, 'a');
+  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+TEST_F(XattrTest, XattrReadOnly) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  // Drop capabilities that allow us to override file and directory permissions.
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IRUSR));
+
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+              SyscallFailsWithErrno(EACCES));
+
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, XattrWriteOnly) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  // Drop capabilities that allow us to override file and directory permissions.
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+  ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IWUSR));
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
+}
+
+TEST_F(XattrTest, XattrTrustedWithNonadmin) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+  SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+  const char* path = test_file_name_.c_str();
+  const char name[] = "trusted.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, XattrOnDirectory) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(getxattr(dir.path().c_str(), name, NULL, 0),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, XattrOnSymlink) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(link.path().c_str(), name, NULL, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(getxattr(link.path().c_str(), name, NULL, 0),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, XattrOnInvalidFileTypes) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  char name[] = "user.abc";
+
+  char char_device[] = "/dev/zero";
+  EXPECT_THAT(setxattr(char_device, name, NULL, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(getxattr(char_device, name, NULL, 0),
+              SyscallFailsWithErrno(ENODATA));
+
+  // Use tmpfs, where creation of named pipes is supported.
+  const std::string fifo = NewTempAbsPathInDir("/dev/shm");
+  const char* path = fifo.c_str();
+  EXPECT_THAT(mknod(path, S_IFIFO | S_IRUSR | S_IWUSR, 0), SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, NULL, 0, /*flags=*/0),
+              SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(getxattr(path, name, NULL, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val = {'a', 'a'};
+  size_t size = 1;
+  EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf = {'-', '-'};
+  std::vector<char> expected_buf = {'a', '-'};
+  EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+              SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrZeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds());
+
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, XATTR_SIZE_MAX),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, SetxattrSizeTooLarge) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+
+  // Note that each particular fs implementation may stipulate a lower size
+  // limit, in which case we actually may fail (e.g. error with ENOSPC) for
+  // some sizes under XATTR_SIZE_MAX.
+  size_t size = XATTR_SIZE_MAX + 1;
+  std::vector<char> val(size);
+  EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+              SyscallFailsWithErrno(E2BIG));
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
+              SyscallFailsWithErrno(EFAULT));
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val(XATTR_SIZE_MAX + 1);
+  std::fill(val.begin(), val.end(), 'a');
+  size_t size = 1;
+  EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf = {'-', '-'};
+  std::vector<char> expected_buf = {'a', '-'};
+  EXPECT_THAT(getxattr(path, name, buf.data(), size),
+              SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val = {'a', 'a'};
+  EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf = {'-', '-'};
+  std::vector<char> expected_buf = {'a', '-'};
+  EXPECT_THAT(getxattr(path, name, buf.data(), 2), SyscallSucceedsWithValue(1));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrReplaceWithLarger) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val = {'a', 'a'};
+  EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf = {'-', '-'};
+  EXPECT_THAT(getxattr(path, name, buf.data(), 2), SyscallSucceedsWithValue(2));
+  EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, SetxattrCreateFlag) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
+              SyscallFailsWithErrno(EEXIST));
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrReplaceFlag) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
+              SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
+              SyscallSucceeds());
+
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrInvalidFlags) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  int invalid_flags = 0xff;
+  EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(XattrTest, Getxattr) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  int val = 1234;
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  int buf = 0;
+  EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  std::vector<char> val = {'a', 'a'};  // Stored via val.data(), two bytes.
+  EXPECT_THAT(setxattr(path, name, val.data(), val.size(), /*flags=*/0),
+              SyscallSucceeds());
+  // A one-byte buffer is too small: expect ERANGE and an untouched buffer.
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, 1), SyscallFailsWithErrno(ERANGE));
+  EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  EXPECT_THAT(setxattr(path, name, &val, 1, /*flags=*/0), SyscallSucceeds());
+
+  std::vector<char> buf(XATTR_SIZE_MAX);
+  std::fill(buf.begin(), buf.end(), '-');
+  std::vector<char> expected_buf = buf;
+  expected_buf[0] = 'a';
+  EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+              SyscallSucceedsWithValue(1));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, GetxattrZeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
+              SyscallSucceeds());
+
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, 0),
+              SyscallSucceedsWithValue(sizeof(val)));
+  EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, GetxattrSizeTooLarge) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
+              SyscallSucceeds());
+
+  std::vector<char> buf(XATTR_SIZE_MAX + 1);
+  std::fill(buf.begin(), buf.end(), '-');
+  std::vector<char> expected_buf = buf;
+  expected_buf[0] = 'a';
+  EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+              SyscallSucceedsWithValue(sizeof(val)));
+  EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, GetxattrNullValue) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  size_t size = sizeof(val);
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  EXPECT_THAT(getxattr(path, name, nullptr, size),
+              SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
+  // TODO(b/127675828): Support setxattr and getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  char name[] = "user.abc";
+  char val = 'a';
+  size_t size = sizeof(val);
+  // Set value with zero size.
+  EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds());
+  // Get value with nonzero size.
+  EXPECT_THAT(getxattr(path, name, nullptr, size), SyscallSucceedsWithValue(0));
+
+  // Set value with nonzero size.
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+  // Get value with zero size.
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(size));
+}
+
+TEST_F(XattrTest, GetxattrNonexistentName) {
+  // TODO(b/127675828): Support getxattr.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const char* path = test_file_name_.c_str();
+  std::string name = "user.nonexistent";
+  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+              SyscallFailsWithErrno(ENODATA));
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From 6fc9f0aefd89ce42ef2c38ea7853f9ba7c4bee04 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 11 Dec 2019 17:51:37 -0800
Subject: Add support for TCP_USER_TIMEOUT option.

The implementation follows the Linux behavior where specifying
a TCP_USER_TIMEOUT will cause the resend timer to honor the
user specified timeout rather than the default RTO-based timeout.

Further it alters when connections are timed out due to keepalive
failures. It does not alter the behavior of when keepalives are
sent. This is as per the Linux behavior.

PiperOrigin-RevId: 285099795
---
 pkg/sentry/socket/netstack/netstack.go       |  23 ++++
 pkg/tcpip/tcpip.go                           |   5 +
 pkg/tcpip/transport/tcp/BUILD                |   1 +
 pkg/tcpip/transport/tcp/accept.go            |  15 +++
 pkg/tcpip/transport/tcp/connect.go           |  19 ++-
 pkg/tcpip/transport/tcp/endpoint.go          |  19 +++
 pkg/tcpip/transport/tcp/protocol.go          |  21 ++-
 pkg/tcpip/transport/tcp/rcv.go               |  19 ++-
 pkg/tcpip/transport/tcp/rcv_state.go         |  29 ++++
 pkg/tcpip/transport/tcp/snd.go               |  48 ++++++-
 pkg/tcpip/transport/tcp/snd_state.go         |  10 ++
 pkg/tcpip/transport/tcp/tcp_test.go          | 194 ++++++++++++++++++++++++---
 test/syscalls/linux/socket_inet_loopback.cc  |  56 +++++++-
 test/syscalls/linux/socket_ip_tcp_generic.cc |  63 +++++++++
 test/syscalls/linux/tcp_socket.cc            |  25 ++++
 15 files changed, 509 insertions(+), 38 deletions(-)
 create mode 100644 pkg/tcpip/transport/tcp/rcv_state.go

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index fe5a46aa3..8a6522eac 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1127,6 +1127,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 
 		return int32(time.Duration(v) / time.Second), nil
 
+	case linux.TCP_USER_TIMEOUT:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.TCPUserTimeoutOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		return int32(time.Duration(v) / time.Millisecond), nil
+
 	case linux.TCP_INFO:
 		var v tcpip.TCPInfoOption
 		if err := ep.GetSockOpt(&v); err != nil {
@@ -1563,6 +1575,17 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))))
 
+	case linux.TCP_USER_TIMEOUT:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+
+		v := int32(usermem.ByteOrder.Uint32(optVal))
+		if v < 0 {
+			return syserr.ErrInvalidArgument
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))))
+
 	case linux.TCP_CONGESTION:
 		v := tcpip.CongestionControlOption(optVal)
 		if err := ep.SetSockOpt(v); err != nil {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index d5bb5b6ed..f62fd729f 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -576,6 +576,11 @@ type KeepaliveIntervalOption time.Duration
 // closed.
 type KeepaliveCountOption int
 
+// TCPUserTimeoutOption is used by SetSockOpt/GetSockOpt to set/get a
+// user-specified timeout for a given TCP connection.
+// See RFC 5482 for details.
+type TCPUserTimeoutOption time.Duration
+
 // CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get
 // the current congestion control algorithm.
 type CongestionControlOption string
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 455a1c098..3b353d56c 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -28,6 +28,7 @@ go_library(
         "forwarder.go",
         "protocol.go",
         "rcv.go",
+        "rcv_state.go",
         "reno.go",
         "sack.go",
         "sack_scoreboard.go",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 74df3edfb..5422ae80c 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -242,6 +242,13 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 
 	n.initGSO()
 
+	// Now inherit any socket options that should be inherited from the
+	// listening endpoint.
+	// When used by a Forwarder, listenEP is nil, hence this check.
+	if l.listenEP != nil {
+		l.listenEP.propagateInheritableOptions(n)
+	}
+
 	// Register new endpoint so that packets are routed to it.
 	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil {
 		n.Close()
@@ -350,6 +357,14 @@ func (e *endpoint) deliverAccepted(n *endpoint) {
 	}
 }
 
+// propagateInheritableOptions propagates inheritable options (currently only
+// userTimeout) from the listening endpoint e to the newly created endpoint n.
+func (e *endpoint) propagateInheritableOptions(n *endpoint) {
+	e.mu.Lock()
+	n.userTimeout = e.userTimeout
+	e.mu.Unlock()
+}
+
 // handleSynSegment is called in its own goroutine once the listening endpoint
 // receives a SYN segment. It is responsible for completing the handshake and
 // queueing the new endpoint for acceptance.
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 3d059c302..4c34fc9d2 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -862,7 +862,7 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 	}
 	e.state = StateError
 	e.HardError = err
-	if err != tcpip.ErrConnectionReset {
+	if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout {
 		// The exact sequence number to be used for the RST is the same as the
 		// one used by Linux. We need to handle the case of window being shrunk
 		// which can cause sndNxt to be outside the acceptable window on the
@@ -1087,12 +1087,24 @@ func (e *endpoint) handleSegments() *tcpip.Error {
 // keepalive packets periodically when the connection is idle. If we don't hear
 // from the other side after a number of tries, we terminate the connection.
 func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
+	e.mu.RLock()
+	userTimeout := e.userTimeout
+	e.mu.RUnlock()
+
 	e.keepalive.Lock()
 	if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() {
 		e.keepalive.Unlock()
 		return nil
 	}
 
+	// If a userTimeout is set then abort the connection if it is
+	// exceeded.
+	if userTimeout != 0 && time.Since(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 {
+		e.keepalive.Unlock()
+		e.stack.Stats().TCP.EstablishedTimedout.Increment()
+		return tcpip.ErrTimeout
+	}
+
 	if e.keepalive.unacked >= e.keepalive.count {
 		e.keepalive.Unlock()
 		e.stack.Stats().TCP.EstablishedTimedout.Increment()
@@ -1112,7 +1124,6 @@ func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
 // whether it is enabled for this endpoint.
 func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
 	e.keepalive.Lock()
-	defer e.keepalive.Unlock()
 	if receivedData {
 		e.keepalive.unacked = 0
 	}
@@ -1120,6 +1131,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
 	// data to send.
 	if !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt {
 		e.keepalive.timer.disable()
+		e.keepalive.Unlock()
 		return
 	}
 	if e.keepalive.unacked > 0 {
@@ -1127,6 +1139,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
 	} else {
 		e.keepalive.timer.enable(e.keepalive.idle)
 	}
+	e.keepalive.Unlock()
 }
 
 // disableKeepaliveTimer stops the keepalive timer.
@@ -1239,6 +1252,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			w: &e.snd.resendWaker,
 			f: func() *tcpip.Error {
 				if !e.snd.retransmitTimerExpired() {
+					e.stack.Stats().TCP.EstablishedTimedout.Increment()
 					return tcpip.ErrTimeout
 				}
 				return nil
@@ -1405,6 +1419,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		if s == nil {
 			break
 		}
+
 		e.tryDeliverSegmentFromClosedEndpoint(s)
 	}
 
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 4861ab513..dd8b47cbe 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -341,6 +341,7 @@ type endpoint struct {
 	// TCP should never broadcast but Linux nevertheless supports enabling/
 	// disabling SO_BROADCAST, albeit as a NOOP.
 	broadcast bool
+
 	// Values used to reserve a port or register a transport endpoint
 	// (which ever happens first).
 	boundBindToDevice tcpip.NICID
@@ -474,6 +475,12 @@ type endpoint struct {
 	// without hearing a response, the connection is closed.
 	keepalive keepalive
 
+	// userTimeout, if non-zero, specifies a user-specified timeout for
+	// a connection w/ pending data to send. A connection that has pending
+	// unacked data will be forcibly aborted if the timeout is reached
+	// without any data being acked.
+	userTimeout time.Duration
+
 	// pendingAccepted is a synchronization primitive used to track number
 	// of connections that are queued up to be delivered to the accepted
 	// channel. We use this to ensure that all goroutines blocked on writing
@@ -1333,6 +1340,12 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
 		return nil
 
+	case tcpip.TCPUserTimeoutOption:
+		e.mu.Lock()
+		e.userTimeout = time.Duration(v)
+		e.mu.Unlock()
+		return nil
+
 	case tcpip.BroadcastOption:
 		e.mu.Lock()
 		e.broadcast = v != 0
@@ -1591,6 +1604,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.keepalive.Unlock()
 		return nil
 
+	case *tcpip.TCPUserTimeoutOption:
+		e.mu.Lock()
+		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
+		e.mu.Unlock()
+		return nil
+
 	case *tcpip.OutOfBandInlineOption:
 		// We don't currently support disabling this option.
 		*o = 1
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 89b965c23..bc718064c 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -162,13 +162,26 @@ func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Transpo
 func replyWithReset(s *segment) {
 	// Get the seqnum from the packet if the ack flag is set.
 	seq := seqnum.Value(0)
+	ack := seqnum.Value(0)
+	flags := byte(header.TCPFlagRst)
+	// As per RFC 793 page 35 (Reset Generation)
+	//   1.  If the connection does not exist (CLOSED) then a reset is sent
+	//   in response to any incoming segment except another reset.  In
+	//   particular, SYNs addressed to a non-existent connection are rejected
+	//   by this means.
+
+	//   If the incoming segment has an ACK field, the reset takes its
+	//   sequence number from the ACK field of the segment, otherwise the
+	//   reset has sequence number zero and the ACK field is set to the sum
+	//   of the sequence number and segment length of the incoming segment.
+	//   The connection remains in the CLOSED state.
 	if s.flagIsSet(header.TCPFlagAck) {
 		seq = s.ackNumber
+	} else {
+		flags |= header.TCPFlagAck
+		ack = s.sequenceNumber.Add(s.logicalLen())
 	}
-
-	ack := s.sequenceNumber.Add(s.logicalLen())
-
-	sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */)
+	sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, flags, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */)
 }
 
 // SetOption implements TransportProtocol.SetOption.
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 5ee499c36..0a5534959 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -50,16 +50,20 @@ type receiver struct {
 	pendingRcvdSegments segmentHeap
 	pendingBufUsed      seqnum.Size
 	pendingBufSize      seqnum.Size
+
+	// Time when the last ack was received.
+	lastRcvdAckTime time.Time `state:".(unixTime)"`
 }
 
 func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8, pendingBufSize seqnum.Size) *receiver {
 	return &receiver{
-		ep:             ep,
-		rcvNxt:         irs + 1,
-		rcvAcc:         irs.Add(rcvWnd + 1),
-		rcvWnd:         rcvWnd,
-		rcvWndScale:    rcvWndScale,
-		pendingBufSize: pendingBufSize,
+		ep:              ep,
+		rcvNxt:          irs + 1,
+		rcvAcc:          irs.Add(rcvWnd + 1),
+		rcvWnd:          rcvWnd,
+		rcvWndScale:     rcvWndScale,
+		pendingBufSize:  pendingBufSize,
+		lastRcvdAckTime: time.Now(),
 	}
 }
 
@@ -360,6 +364,9 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
 		return true, nil
 	}
 
+	// Store the time of the last ack.
+	r.lastRcvdAckTime = time.Now()
+
 	// Defer segment processing if it can't be consumed now.
 	if !r.consumeSegment(s, segSeq, segLen) {
 		if segLen > 0 || s.flagIsSet(header.TCPFlagFin) {
diff --git a/pkg/tcpip/transport/tcp/rcv_state.go b/pkg/tcpip/transport/tcp/rcv_state.go
new file mode 100644
index 000000000..2bf21a2e7
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/rcv_state.go
@@ -0,0 +1,29 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"time"
+)
+
+// saveLastRcvdAckTime is invoked by stateify.
+func (r *receiver) saveLastRcvdAckTime() unixTime {
+	return unixTime{r.lastRcvdAckTime.Unix(), r.lastRcvdAckTime.UnixNano()}
+}
+
+// loadLastRcvdAckTime is invoked by stateify.
+func (r *receiver) loadLastRcvdAckTime(unix unixTime) {
+	r.lastRcvdAckTime = time.Unix(unix.second, unix.nano)
+}
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 8332a0179..8a947dc66 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -28,8 +28,11 @@ import (
 )
 
 const (
-	// minRTO is the minimum allowed value for the retransmit timeout.
-	minRTO = 200 * time.Millisecond
+	// MinRTO is the minimum allowed value for the retransmit timeout.
+	MinRTO = 200 * time.Millisecond
+
+	// MaxRTO is the maximum allowed value for the retransmit timeout.
+	MaxRTO = 120 * time.Second
 
 	// InitialCwnd is the initial congestion window.
 	InitialCwnd = 10
@@ -134,6 +137,10 @@ type sender struct {
 	// rttMeasureTime is the time when the rttMeasureSeqNum was sent.
 	rttMeasureTime time.Time `state:".(unixTime)"`
 
+	// firstRetransmittedSegXmitTime is the original transmit time of
+	// the first segment that was retransmitted due to RTO expiration.
+	firstRetransmittedSegXmitTime time.Time `state:".(unixTime)"`
+
 	closed      bool
 	writeNext   *segment
 	writeList   segmentList
@@ -392,8 +399,8 @@ func (s *sender) updateRTO(rtt time.Duration) {
 
 	s.rto = s.rtt.srtt + 4*s.rtt.rttvar
 	s.rtt.Unlock()
-	if s.rto < minRTO {
-		s.rto = minRTO
+	if s.rto < MinRTO {
+		s.rto = MinRTO
 	}
 }
 
@@ -438,8 +445,30 @@ func (s *sender) retransmitTimerExpired() bool {
 	s.ep.stack.Stats().TCP.Timeouts.Increment()
 	s.ep.stats.SendErrors.Timeouts.Increment()
 
-	// Give up if we've waited more than a minute since the last resend.
-	if s.rto >= 60*time.Second {
+	// Give up if the RTO has already backed off to MaxRTO or if a user
+	// timeout is set and it has been exceeded since the original transmit
+	// time of the first retransmitted segment.
+	s.ep.mu.RLock()
+	uto := s.ep.userTimeout
+	s.ep.mu.RUnlock()
+
+	if s.firstRetransmittedSegXmitTime.IsZero() {
+		// We store the original xmitTime of the segment that we are
+		// about to retransmit as the retransmission time. This is
+		// required as by the time the retransmitTimer has expired the
+		// segment has already been sent and unacked for the RTO at the
+		// time the segment was sent.
+		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
+	}
+
+	elapsed := time.Since(s.firstRetransmittedSegXmitTime)
+	remaining := MaxRTO
+	if uto != 0 {
+		// Cap to the user specified timeout if one is specified.
+		remaining = uto - elapsed
+	}
+
+	if remaining <= 0 || s.rto >= MaxRTO {
 		return false
 	}
 
@@ -447,6 +476,11 @@ func (s *sender) retransmitTimerExpired() bool {
 	// below.
 	s.rto *= 2
 
+	// Cap RTO to remaining time.
+	if s.rto > remaining {
+		s.rto = remaining
+	}
+
 	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
 	//
 	// Retransmit timeouts:
@@ -1168,6 +1202,8 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 		// RFC 6298 Rule 5.3
 		if s.sndUna == s.sndNxt {
 			s.outstanding = 0
+			// Reset firstRetransmittedSegXmitTime to the zero value.
+			s.firstRetransmittedSegXmitTime = time.Time{}
 			s.resendTimer.disable()
 		}
 	}
diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go
index 12eff8afc..8b20c3455 100644
--- a/pkg/tcpip/transport/tcp/snd_state.go
+++ b/pkg/tcpip/transport/tcp/snd_state.go
@@ -48,3 +48,13 @@ func (s *sender) loadRttMeasureTime(unix unixTime) {
 func (s *sender) afterLoad() {
 	s.resendTimer.init(&s.resendWaker)
 }
+
+// saveFirstRetransmittedSegXmitTime is invoked by stateify.
+func (s *sender) saveFirstRetransmittedSegXmitTime() unixTime {
+	return unixTime{s.firstRetransmittedSegXmitTime.Unix(), s.firstRetransmittedSegXmitTime.UnixNano()}
+}
+
+// loadFirstRetransmittedSegXmitTime is invoked by stateify.
+func (s *sender) loadFirstRetransmittedSegXmitTime(unix unixTime) {
+	s.firstRetransmittedSegXmitTime = time.Unix(unix.second, unix.nano)
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index bc5cfcf0e..2a83f7bcc 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -323,8 +323,8 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.SeqNum(uint32(c.IRS+1)),
-		checker.AckNum(uint32(iss)+1),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
 }
 
 func TestTCPResetsReceivedIncrement(t *testing.T) {
@@ -460,18 +460,17 @@ func TestConnectResetAfterClose(t *testing.T) {
 			checker.TCP(
 				checker.DstPort(context.TestPort),
 				checker.SeqNum(uint32(c.IRS)+2),
-				checker.AckNum(790),
-				checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+				checker.AckNum(0),
+				checker.TCPFlags(header.TCPFlagRst),
 			),
 		)
 		break
 	}
 }
 
-// TestClosingWithEnqueuedSegments tests handling of
-// still enqueued segments when the endpoint transitions
-// to StateClose. The in-flight segments would be re-enqueued
-// to a any listening endpoint.
+// TestClosingWithEnqueuedSegments tests handling of still enqueued segments
+// when the endpoint transitions to StateClose. The in-flight segments would be
+// re-enqueued to any listening endpoint.
 func TestClosingWithEnqueuedSegments(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
@@ -576,8 +575,8 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
 		checker.TCP(
 			checker.DstPort(context.TestPort),
 			checker.SeqNum(uint32(c.IRS)+2),
-			checker.AckNum(793),
-			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+			checker.AckNum(0),
+			checker.TCPFlags(header.TCPFlagRst),
 		),
 	)
 }
@@ -914,7 +913,7 @@ func TestSendRstOnListenerRxAckV4(t *testing.T) {
 
 	checker.IPv4(t, c.GetPacket(), checker.TCP(
 		checker.DstPort(context.TestPort),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.TCPFlags(header.TCPFlagRst),
 		checker.SeqNum(200)))
 }
 
@@ -942,7 +941,7 @@ func TestSendRstOnListenerRxAckV6(t *testing.T) {
 
 	checker.IPv6(t, c.GetV6Packet(), checker.TCP(
 		checker.DstPort(context.TestPort),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.TCPFlags(header.TCPFlagRst),
 		checker.SeqNum(200)))
 }
 
@@ -4291,8 +4290,9 @@ func TestKeepalive(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
+	const keepAliveInterval = 10 * time.Millisecond
 	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
-	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(10 * time.Millisecond))
+	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
 	c.EP.SetSockOpt(tcpip.KeepaliveCountOption(5))
 	c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1))
 
@@ -4382,13 +4382,29 @@ func TestKeepalive(t *testing.T) {
 		)
 	}
 
+	// Sleep for a little over the KeepAlive interval to make sure
+	// the timer has time to fire after the last ACK and close the
+	// socket.
+	time.Sleep(keepAliveInterval + 5*time.Millisecond)
+
 	// The connection should be terminated after 5 unacked keepalives.
+	// Send an ACK to trigger a RST from the stack as the endpoint should
+	// be dead.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
 	checker.IPv4(t, c.GetPacket(),
 		checker.TCP(
 			checker.DstPort(context.TestPort),
 			checker.SeqNum(uint32(next)),
-			checker.AckNum(uint32(790)),
-			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+			checker.AckNum(uint32(0)),
+			checker.TCPFlags(header.TCPFlagRst),
 		),
 	)
 
@@ -6157,8 +6173,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.SeqNum(uint32(ackHeaders.AckNum)),
-		checker.AckNum(uint32(ackHeaders.SeqNum)),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
 
 	if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != want {
 		t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %v, want = %v", got, want)
@@ -6336,7 +6352,147 @@ func TestTCPCloseWithData(t *testing.T) {
 		checker.SrcPort(context.StackPort),
 		checker.DstPort(context.TestPort),
 		checker.SeqNum(uint32(ackHeaders.AckNum)),
-		checker.AckNum(uint32(ackHeaders.SeqNum)),
-		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
+}
+
+func TestTCPUserTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
 
+	userTimeout := 50 * time.Millisecond
+	c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
+
+	// Send some data and wait before ACKing it.
+	view := buffer.NewView(3)
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %v", err)
+	}
+
+	next := uint32(c.IRS) + 1
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(len(view)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	// Wait for a little over the minimum retransmit timeout of 200ms for
+	// the retransmitTimer to fire and close the connection.
+	time.Sleep(tcp.MinRTO + 10*time.Millisecond)
+
+	// No packet should be received as the connection should be silently
+	// closed due to timeout.
+	c.CheckNoPacket("unexpected packet received after userTimeout has expired")
+
+	next += uint32(len(view))
+
+	// The connection should be terminated after userTimeout has expired.
+	// Send an ACK to trigger a RST from the stack as the endpoint should
+	// be dead.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(next)),
+			checker.AckNum(uint32(0)),
+			checker.TCPFlags(header.TCPFlagRst),
+		),
+	)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout)
+	}
+
+	if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want)
+	}
+}
+
+func TestKeepaliveWithUserTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
+
+	const keepAliveInterval = 10 * time.Millisecond
+	c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
+	c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
+	c.EP.SetSockOpt(tcpip.KeepaliveCountOption(10))
+	c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1))
+
+	// Set userTimeout to be the duration for 3 keepalive probes.
+	userTimeout := 30 * time.Millisecond
+	c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
+
+	// Check that the connection is still alive.
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock)
+	}
+
+	// Now receive 2 keepalives, but don't ACK them. The connection should
+	// be reset when the 3rd one would be sent, since userTimeout is 30ms
+	// and keepalive probes are sent 10ms apart (as set above) once the
+	// connection has been idle for 10ms.
+	for i := 0; i < 2; i++ {
+		b := c.GetPacket()
+		checker.IPv4(t, b,
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)),
+				checker.AckNum(uint32(790)),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+	}
+
+	// Sleep for a little over the KeepAlive interval to make sure
+	// the timer has time to fire after the last ACK and close the
+	// socket.
+	time.Sleep(keepAliveInterval + 5*time.Millisecond)
+
+	// The connection should be terminated after 30ms.
+	// Send an ACK to trigger a RST from the stack as the endpoint should
+	// be dead.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(c.IRS + 1),
+		RcvWnd:  30000,
+	})
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS+1)),
+			checker.AckNum(uint32(0)),
+			checker.TCPFlags(header.TCPFlagRst),
+		),
+	)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout)
+	}
+	if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want)
+	}
 }
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index fa4358ae4..761c3a9fe 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -206,7 +206,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   }
   // TODO(b/138400178): Fix cooperative S/R failure when ds.reset() is invoked
   // before function end.
-  // ds.reset()
+  // ds.reset();
 }
 
 TEST_P(SocketInetLoopbackTest, TCPbacklog) {
@@ -603,6 +603,60 @@ TEST_P(SocketInetLoopbackTest, TCPTimeWaitTest_NoRandomSave) {
               SyscallSucceeds());
 }
 
+TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Set the userTimeout on the listening socket.
+  constexpr int kUserTimeout = 10;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kUserTimeout, sizeof(kUserTimeout)),
+              SyscallSucceeds());
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+  // Verify that the accepted socket inherited the user timeout set on
+  // the listening socket.
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(accepted.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kUserTimeout);
+}
+
 INSTANTIATE_TEST_SUITE_P(
     All, SocketInetLoopbackTest,
     ::testing::Values(
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index c74273436..57ce8e169 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -812,5 +812,68 @@ TEST_P(TCPSocketPairTest, TestTCPCloseWithData) {
   ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
 }
 
+TEST_P(TCPSocketPairTest, TCPUserTimeoutDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &get, &get_len),
+              SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);  // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutZero) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kZero = 0;
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kZero, sizeof(kZero)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &get, &get_len),
+              SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);  // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutBelowZero) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kNeg = -10;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kNeg, sizeof(kNeg)),
+              SyscallFailsWithErrno(EINVAL));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &get, &get_len),
+              SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);  // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kAbove = 10;
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kAbove, sizeof(kAbove)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &get, &get_len),
+              SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kAbove);
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 99863b0ed..c503f3568 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1175,6 +1175,31 @@ TEST_P(SimpleTcpSocketTest, SetMaxSegFailsForInvalidMSSValues) {
   }
 }
 
+TEST_P(SimpleTcpSocketTest, SetTCPUserTimeout) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  {
+    constexpr int kTCPUserTimeout = -1;
+    EXPECT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                           &kTCPUserTimeout, sizeof(kTCPUserTimeout)),
+                SyscallFailsWithErrno(EINVAL));
+  }
+
+  // kTCPUserTimeout is in milliseconds.
+  constexpr int kTCPUserTimeout = 100;
+  ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+                         &kTCPUserTimeout, sizeof(kTCPUserTimeout)),
+              SyscallSucceedsWithValue(0));
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kTCPUserTimeout);
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From 378d6c1f3697b8b939e6632e980562bfc8fb2781 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Thu, 12 Dec 2019 11:07:25 -0800
Subject: unix: allow to bind unix sockets only to AF_UNIX addresses

Reported-by: syzbot+2c0bcfd87fb4e8b7b009@syzkaller.appspotmail.com
PiperOrigin-RevId: 285228312
---
 pkg/sentry/socket/netstack/netstack.go |  2 +-
 pkg/sentry/socket/unix/unix.go         |  3 +++
 test/syscalls/linux/socket_unix.cc     | 15 +++++++++++++++
 3 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 8a6522eac..140851c17 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -326,7 +326,7 @@ func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress,
 	}
 
 	family := usermem.ByteOrder.Uint16(addr)
-	if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) {
+	if family != uint16(sfamily) && (strict || family != linux.AF_UNSPEC) {
 		return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported
 	}
 
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 1aaae8487..885758054 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -118,6 +118,9 @@ func (s *SocketOperations) Endpoint() transport.Endpoint {
 func extractPath(sockaddr []byte) (string, *syserr.Error) {
 	addr, _, err := netstack.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */)
 	if err != nil {
+		if err == syserr.ErrAddressFamilyNotSupported {
+			err = syserr.ErrInvalidArgument
+		}
 		return "", err
 	}
 
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc
index 8a28202a8..4cf1f76f1 100644
--- a/test/syscalls/linux/socket_unix.cc
+++ b/test/syscalls/linux/socket_unix.cc
@@ -65,6 +65,21 @@ TEST_P(UnixSocketPairTest, BindToBadName) {
       SyscallFailsWithErrno(ENOENT));
 }
 
+TEST_P(UnixSocketPairTest, BindToBadFamily) {
+  auto pair =
+      ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+  constexpr char kBadName[] = "/some/path/that/does/not/exist";
+  sockaddr_un sockaddr;
+  sockaddr.sun_family = AF_INET;
+  memcpy(sockaddr.sun_path, kBadName, sizeof(kBadName));
+
+  EXPECT_THAT(
+      bind(pair->first_fd(), reinterpret_cast<struct sockaddr*>(&sockaddr),
+           sizeof(sockaddr)),
+      SyscallFailsWithErrno(EINVAL));
+}
+
 TEST_P(UnixSocketPairTest, RecvmmsgTimeoutAfterRecv) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
   char sent_data[10];
-- 
cgit v1.2.3


From e6f4124afd951c3b089f9c75c499c14f4d90a590 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 16 Dec 2019 13:18:36 -0800
Subject: Implement checks for get/setxattr at the syscall layer.

Add checks for input arguments, file type, permissions, etc. that match
the Linux implementation. A call to get/setxattr that passes all the
checks will still currently return EOPNOTSUPP. Actual support will be
added in following commits.

Only allow user.* extended attributes for the time being.

PiperOrigin-RevId: 285835159
---
 pkg/abi/linux/BUILD                        |   1 +
 pkg/abi/linux/xattr.go                     |  27 +++++
 pkg/sentry/fs/inode.go                     |   8 ++
 pkg/sentry/fs/inode_overlay.go             |   5 +
 pkg/sentry/syscalls/linux/BUILD            |   1 +
 pkg/sentry/syscalls/linux/linux64_amd64.go |   4 +-
 pkg/sentry/syscalls/linux/linux64_arm64.go |   4 +-
 pkg/sentry/syscalls/linux/sys_xattr.go     | 169 +++++++++++++++++++++++++++++
 test/syscalls/linux/xattr.cc               |  79 +++++++-------
 9 files changed, 253 insertions(+), 45 deletions(-)
 create mode 100644 pkg/abi/linux/xattr.go
 create mode 100644 pkg/sentry/syscalls/linux/sys_xattr.go

(limited to 'test/syscalls/linux')

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 51774c6b6..9553f164d 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -57,6 +57,7 @@ go_library(
         "uio.go",
         "utsname.go",
         "wait.go",
+        "xattr.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/abi/linux",
     visibility = ["//visibility:public"],
diff --git a/pkg/abi/linux/xattr.go b/pkg/abi/linux/xattr.go
new file mode 100644
index 000000000..a3b6406fa
--- /dev/null
+++ b/pkg/abi/linux/xattr.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// Constants for extended attributes.
+const (
+	XATTR_NAME_MAX = 255
+	XATTR_SIZE_MAX = 65536
+
+	XATTR_CREATE  = 1
+	XATTR_REPLACE = 2
+
+	XATTR_USER_PREFIX     = "user."
+	XATTR_USER_PREFIX_LEN = len(XATTR_USER_PREFIX)
+)
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index 2d43dff1d..91e2fde2f 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -270,6 +270,14 @@ func (i *Inode) Getxattr(name string) (string, error) {
 	return i.InodeOperations.Getxattr(i, name)
 }
 
+// Setxattr calls i.InodeOperations.Setxattr with i as the Inode.
+func (i *Inode) Setxattr(name, value string) error {
+	if i.overlay != nil {
+		return overlaySetxattr(i.overlay, name, value)
+	}
+	return i.InodeOperations.Setxattr(i, name, value)
+}
+
 // Listxattr calls i.InodeOperations.Listxattr with i as the Inode.
 func (i *Inode) Listxattr() (map[string]struct{}, error) {
 	if i.overlay != nil {
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index a09147080..63a991beb 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -552,6 +552,11 @@ func overlayGetxattr(o *overlayEntry, name string) (string, error) {
 	return s, err
 }
 
+// TODO(b/146028302): Support setxattr for overlayfs.
+func overlaySetxattr(o *overlayEntry, name, value string) error {
+	return syserror.EOPNOTSUPP
+}
+
 func overlayListxattr(o *overlayEntry) (map[string]struct{}, error) {
 	o.copyMu.RLock()
 	defer o.copyMu.RUnlock()
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 4c0bf96e4..6766ba587 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -49,6 +49,7 @@ go_library(
         "sys_tls.go",
         "sys_utsname.go",
         "sys_write.go",
+        "sys_xattr.go",
         "timespec.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls/linux",
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 797542d28..272ae9991 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -228,10 +228,10 @@ var AMD64 = &kernel.SyscallTable{
 		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
-		188: syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		188: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil),
 		189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		191: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil),
 		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 2bc7faff5..3b584eed9 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -41,10 +41,10 @@ var ARM64 = &kernel.SyscallTable{
 		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		5:   syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		5:   syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil),
 		6:   syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		7:   syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		8:   syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		8:   syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil),
 		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
new file mode 100644
index 000000000..97d9a65ea
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -0,0 +1,169 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Getxattr implements linux syscall getxattr(2).
+func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	valueLen := 0
+	err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		value, err := getxattr(t, d, dirPath, nameAddr)
+		if err != nil {
+			return err
+		}
+
+		valueLen = len(value)
+		if size == 0 {
+			return nil
+		}
+		if size > linux.XATTR_SIZE_MAX {
+			size = linux.XATTR_SIZE_MAX
+		}
+		if valueLen > int(size) {
+			return syserror.ERANGE
+		}
+
+		_, err = t.CopyOutBytes(valueAddr, []byte(value))
+		return err
+	})
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(valueLen), nil, nil
+}
+
+// getxattr implements getxattr from the given *fs.Dirent.
+func getxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr) (string, error) {
+	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+		return "", syserror.ENOTDIR
+	}
+
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
+		return "", err
+	}
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return "", err
+	}
+
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return "", syserror.EOPNOTSUPP
+	}
+
+	return d.Inode.Getxattr(name)
+}
+
+// Setxattr implements linux syscall setxattr(2).
+func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr := args[0].Pointer()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := args[3].SizeT()
+	flags := args[4].Uint()
+
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		return setxattr(t, d, dirPath, nameAddr, valueAddr, size, flags)
+	})
+}
+
+// setxattr implements setxattr from the given *fs.Dirent.
+func setxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint, flags uint32) error {
+	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+		return syserror.ENOTDIR
+	}
+
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+		return err
+	}
+
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+
+	if size > linux.XATTR_SIZE_MAX {
+		return syserror.E2BIG
+	}
+	buf := make([]byte, size)
+	if _, err = t.CopyInBytes(valueAddr, buf); err != nil {
+		return err
+	}
+	value := string(buf)
+
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+
+	return d.Inode.Setxattr(name, value)
+}
+
+func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
+	name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1)
+	if err != nil {
+		if err == syserror.ENAMETOOLONG {
+			return "", syserror.ERANGE
+		}
+		return "", err
+	}
+	if len(name) == 0 {
+		return "", syserror.ERANGE
+	}
+	return name, nil
+}
+
+func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error {
+	// Restrict xattrs to regular files and directories.
+	//
+	// In Linux, this restriction technically only applies to xattrs in the
+	// "user.*" namespace, but we don't allow any other xattr prefixes anyway.
+	if !fs.IsRegular(i.StableAttr) && !fs.IsDir(i.StableAttr) {
+		if perms.Write {
+			return syserror.EPERM
+		}
+		return syserror.ENODATA
+	}
+
+	return i.CheckPermission(t, perms)
+}
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index 3e07b634b..75740238c 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -28,6 +28,7 @@
 #include "test/util/capability_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
+#include "test/util/test_util.h"
 
 namespace gvisor {
 namespace testing {
@@ -37,9 +38,6 @@ namespace {
 class XattrTest : public FileTest {};
 
 TEST_F(XattrTest, XattrNullName) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
@@ -49,9 +47,6 @@ TEST_F(XattrTest, XattrNullName) {
 }
 
 TEST_F(XattrTest, XattrEmptyName) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
@@ -60,16 +55,17 @@ TEST_F(XattrTest, XattrEmptyName) {
 }
 
 TEST_F(XattrTest, XattrLargeName) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   std::string name = "user.";
   name += std::string(XATTR_NAME_MAX - name.length(), 'a');
-  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
-              SyscallSucceeds());
-  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
-              SyscallSucceedsWithValue(0));
+
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+                SyscallSucceeds());
+    EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+                SyscallSucceedsWithValue(0));
+  }
 
   name += "a";
   EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
@@ -79,9 +75,6 @@ TEST_F(XattrTest, XattrLargeName) {
 }
 
 TEST_F(XattrTest, XattrInvalidPrefix) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   std::string name(XATTR_NAME_MAX, 'a');
   EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
@@ -91,9 +84,6 @@ TEST_F(XattrTest, XattrInvalidPrefix) {
 }
 
 TEST_F(XattrTest, XattrReadOnly) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -102,22 +92,28 @@ TEST_F(XattrTest, XattrReadOnly) {
   char name[] = "user.abc";
   char val = 'a';
   size_t size = sizeof(val);
-  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+                SyscallSucceeds());
+  }
 
   ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IRUSR));
 
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
               SyscallFailsWithErrno(EACCES));
 
-  char buf = '-';
-  EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
-  EXPECT_EQ(buf, val);
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    char buf = '-';
+    EXPECT_THAT(getxattr(path, name, &buf, size),
+                SyscallSucceedsWithValue(size));
+    EXPECT_EQ(buf, val);
+  }
 }
 
 TEST_F(XattrTest, XattrWriteOnly) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -128,7 +124,12 @@ TEST_F(XattrTest, XattrWriteOnly) {
   char name[] = "user.abc";
   char val = 'a';
   size_t size = sizeof(val);
-  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+                SyscallSucceeds());
+  }
 
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
 }
@@ -172,9 +173,6 @@ TEST_F(XattrTest, XattrOnSymlink) {
 }
 
 TEST_F(XattrTest, XattrOnInvalidFileTypes) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   char name[] = "user.abc";
 
   char char_device[] = "/dev/zero";
@@ -226,9 +224,6 @@ TEST_F(XattrTest, SetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrSizeTooLarge) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   char name[] = "user.abc";
 
@@ -240,19 +235,24 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
   EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
               SyscallFailsWithErrno(E2BIG));
 
-  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(getxattr(path, name, nullptr, 0),
+                SyscallFailsWithErrno(ENODATA));
+  }
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   char name[] = "user.abc";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
               SyscallFailsWithErrno(EFAULT));
 
-  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+  // TODO(b/127675828): Support setxattr and getxattr.
+  if (!IsRunningOnGvisor()) {
+    EXPECT_THAT(getxattr(path, name, nullptr, 0),
+                SyscallFailsWithErrno(ENODATA));
+  }
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
@@ -350,9 +350,6 @@ TEST_F(XattrTest, SetxattrReplaceFlag) {
 }
 
 TEST_F(XattrTest, SetxattrInvalidFlags) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   int invalid_flags = 0xff;
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
-- 
cgit v1.2.3


From 67000b929b9f5a3aedf6f5f56611c76411d02d78 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 17 Dec 2019 06:31:41 -0800
Subject: Explicitly export files needed by other packages

PiperOrigin-RevId: 285968611
---
 test/syscalls/linux/BUILD | 1 +
 1 file changed, 1 insertion(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 0bbaaf28a..e6568128e 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -9,6 +9,7 @@ package(
 exports_files(
     [
         "socket.cc",
+        "socket_ip_loopback_blocking.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
         "tcp_socket.cc",
         "udp_socket.cc",
-- 
cgit v1.2.3


From 3f4d8fefb45d75937292302e4c158f76da5c7ca8 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 17 Dec 2019 10:08:47 -0800
Subject: Internal change.

PiperOrigin-RevId: 286003946
---
 pkg/tcpip/transport/tcp/connect.go  |  8 +++++
 pkg/tcpip/transport/tcp/tcp_test.go | 65 +++++++++++++++++++++++++++++++++
 test/syscalls/linux/tcp_socket.cc   | 72 +++++++++++++++++++++++++++++++++++++
 3 files changed, 145 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 4c34fc9d2..cdd69f360 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -218,6 +218,14 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	// acceptable if the ack field acknowledges the SYN.
 	if s.flagIsSet(header.TCPFlagRst) {
 		if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == h.iss+1 {
+			// RFC 793, page 67, states that "If the RST bit is set [and] If the ACK
+			// was acceptable then signal the user "error: connection reset", drop
+			// the segment, enter CLOSED state, delete TCB, and return."
+			h.ep.mu.Lock()
+			h.ep.workerCleanup = true
+			h.ep.mu.Unlock()
+			// Although the RFC above calls out ECONNRESET, Linux actually returns
+			// ECONNREFUSED here so we do as well.
 			return tcpip.ErrConnectionRefused
 		}
 		return nil
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 2a83f7bcc..e8fe4dab5 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -1140,6 +1140,71 @@ func TestConnectBindToDevice(t *testing.T) {
 	}
 }
 
+func TestRstOnSynSent(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Create an endpoint, don't handshake because we want to interfere with the
+	// handshake process.
+	c.Create(-1)
+
+	// Start connection attempt.
+	waitEntry, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventOut)
+	defer c.WQ.EventUnregister(&waitEntry)
+
+	addr := tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}
+	if err := c.EP.Connect(addr); err != tcpip.ErrConnectStarted {
+		t.Fatalf("got Connect(%+v) = %v, want %s", addr, err, tcpip.ErrConnectStarted)
+	}
+
+	// Receive SYN packet.
+	b := c.GetPacket()
+	checker.IPv4(t, b,
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagSyn),
+		),
+	)
+
+	// Ensure that we've reached SynSent state
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want {
+		t.Fatalf("got State() = %s, want %s", got, want)
+	}
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	// Send a packet with a proper ACK and a RST flag to cause the socket
+	// to Error and close out
+	iss := seqnum.Value(789)
+	rcvWnd := seqnum.Size(30000)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: tcpHdr.DestinationPort(),
+		DstPort: tcpHdr.SourcePort(),
+		Flags:   header.TCPFlagRst | header.TCPFlagAck,
+		SeqNum:  iss,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  rcvWnd,
+		TCPOpts: nil,
+	})
+
+	// Wait for receive to be notified.
+	select {
+	case <-ch:
+	case <-time.After(3 * time.Second):
+		t.Fatal("timed out waiting for packet to arrive")
+	}
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionRefused {
+		t.Fatalf("got c.EP.Read(nil) = %v, want = %s", err, tcpip.ErrConnectionRefused)
+	}
+
+	// Due to the RST the endpoint should be in an error state.
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
+		t.Fatalf("got State() = %s, want %s", got, want)
+	}
+}
+
 func TestOutOfOrderReceive(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index c503f3568..6b99c021d 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -967,6 +967,78 @@ TEST_P(SimpleTcpSocketTest, BlockingConnectRefused) {
   EXPECT_THAT(close(s.release()), SyscallSucceeds());
 }
 
+// Test that connecting to a non-listening port and thus receiving a RST is
+// handled appropriately by the socket - the port that the socket was bound to
+// is released and the expected error is returned.
+TEST_P(SimpleTcpSocketTest, CleanupOnConnectionRefused) {
+  // Create a socket that is known to not be listening. As it is bound but not
+  // listening, when another socket connects to the port, it will refuse.
+  FileDescriptor bound_s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage bound_addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t bound_addrlen = sizeof(bound_addr);
+
+  ASSERT_THAT(
+      bind(bound_s.get(), reinterpret_cast<struct sockaddr*>(&bound_addr),
+           bound_addrlen),
+      SyscallSucceeds());
+
+  // Get the addresses the socket is bound to because the port is chosen by the
+  // stack.
+  ASSERT_THAT(getsockname(bound_s.get(),
+                          reinterpret_cast<struct sockaddr*>(&bound_addr),
+                          &bound_addrlen),
+              SyscallSucceeds());
+
+  // Create, initialize, and bind the socket that is used to test connecting to
+  // the non-listening port.
+  FileDescriptor client_s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  // Initialize client address to the loopback one.
+  sockaddr_storage client_addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t client_addrlen = sizeof(client_addr);
+
+  ASSERT_THAT(
+      bind(client_s.get(), reinterpret_cast<struct sockaddr*>(&client_addr),
+           client_addrlen),
+      SyscallSucceeds());
+
+  ASSERT_THAT(getsockname(client_s.get(),
+                          reinterpret_cast<struct sockaddr*>(&client_addr),
+                          &client_addrlen),
+              SyscallSucceeds());
+
+  // Now the test: connect to the bound but not listening socket with the
+  // client socket. The bound socket should return a RST and cause the client
+  // socket to return an error and clean itself up immediately.
+  // The error being ECONNREFUSED diverges with RFC 793, page 37, but does what
+  // Linux does.
+  ASSERT_THAT(connect(client_s.get(),
+                      reinterpret_cast<const struct sockaddr*>(&bound_addr),
+                      bound_addrlen),
+              SyscallFailsWithErrno(ECONNREFUSED));
+
+  FileDescriptor new_s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  // Test binding to the address from the client socket. This should be okay
+  // if it was dropped correctly.
+  ASSERT_THAT(
+      bind(new_s.get(), reinterpret_cast<struct sockaddr*>(&client_addr),
+           client_addrlen),
+      SyscallSucceeds());
+
+  // Attempt #2, with the new socket and reused addr our connect should fail in
+  // the same way as before, not with an EADDRINUSE.
+  ASSERT_THAT(connect(new_s.get(),
+                      reinterpret_cast<const struct sockaddr*>(&bound_addr),
+                      bound_addrlen),
+              SyscallFailsWithErrno(ECONNREFUSED));
+}
+
 // Test that we get an ECONNREFUSED with a nonblocking socket.
 TEST_P(SimpleTcpSocketTest, NonBlockingConnectRefused) {
   FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
-- 
cgit v1.2.3


From 65f53c583364295cbc211b38fae126fb88f08ec0 Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Wed, 18 Dec 2019 12:28:13 -0800
Subject: Put GetSocketPairs() in unnamed namespace

This avoids conflicting definitions of GetSocketPairs() in outer namespace when
multiple such cc files are compiled for one binary.

PiperOrigin-RevId: 286243045
---
 test/syscalls/linux/BUILD                          | 1 +
 test/syscalls/linux/socket_ip_loopback_blocking.cc | 2 ++
 test/syscalls/linux/socket_ip_tcp_loopback.cc      | 2 ++
 3 files changed, 5 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index e6568128e..675ff5cdb 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -10,6 +10,7 @@ exports_files(
     [
         "socket.cc",
         "socket_ip_loopback_blocking.cc",
+        "socket_ip_tcp_loopback.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
         "tcp_socket.cc",
         "udp_socket.cc",
diff --git a/test/syscalls/linux/socket_ip_loopback_blocking.cc b/test/syscalls/linux/socket_ip_loopback_blocking.cc
index e58eedaba..fda252dd7 100644
--- a/test/syscalls/linux/socket_ip_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_loopback_blocking.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(
@@ -43,5 +44,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingIPSockets, BlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback.cc b/test/syscalls/linux/socket_ip_tcp_loopback.cc
index 831de53b8..9db3037bc 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -34,5 +35,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllUnixDomainSockets, AllSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 18d6e59b457c8a91bf7db518fbb9193c49d2ee7c Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Wed, 18 Dec 2019 13:06:02 -0800
Subject: Switch to netinet/tcp.h and poll.h for better platform
 portability.

PiperOrigin-RevId: 286249699
---
 test/syscalls/linux/BUILD                   |  1 +
 test/syscalls/linux/socket_inet_loopback.cc | 67 +++++++++++++++--------------
 2 files changed, 36 insertions(+), 32 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 675ff5cdb..064ce8429 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -9,6 +9,7 @@ package(
 exports_files(
     [
         "socket.cc",
+        "socket_inet_loopback.cc",
         "socket_ip_loopback_blocking.cc",
         "socket_ip_tcp_loopback.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 761c3a9fe..5bb9d2e99 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -13,12 +13,10 @@
 // limitations under the License.
 
 #include <arpa/inet.h>
-#include <linux/tcp.h>
 #include <netinet/in.h>
+#include <netinet/tcp.h>
 #include <poll.h>
 #include <string.h>
-#include <sys/epoll.h>
-#include <sys/socket.h>
 
 #include <atomic>
 #include <iostream>
@@ -46,6 +44,8 @@ namespace testing {
 
 namespace {
 
+using ::testing::Gt;
+
 PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) {
   switch (family) {
     case AF_INET:
@@ -976,41 +976,44 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
                 SyscallSucceedsWithValue(sizeof(i)));
   }
 
-  int epollfd;
-  ASSERT_THAT(epollfd = epoll_create1(0), SyscallSucceeds());
-
+  struct pollfd pollfds[kThreadCount];
   for (int i = 0; i < kThreadCount; i++) {
-    int fd = listener_fds[i].get();
-    struct epoll_event ev;
-    ev.data.fd = fd;
-    ev.events = EPOLLIN;
-    ASSERT_THAT(epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev), SyscallSucceeds());
+    pollfds[i].fd = listener_fds[i].get();
+    pollfds[i].events = POLLIN;
   }
 
   std::map<uint16_t, int> portToFD;
 
-  for (int i = 0; i < kConnectAttempts * 2; i++) {
-    struct sockaddr_storage addr = {};
-    socklen_t addrlen = sizeof(addr);
-    struct epoll_event ev;
-    int data, fd;
+  int received = 0;
+  while (received < kConnectAttempts * 2) {
+    ASSERT_THAT(poll(pollfds, kThreadCount, -1),
+                SyscallSucceedsWithValue(Gt(0)));
 
-    ASSERT_THAT(epoll_wait(epollfd, &ev, 1, -1), SyscallSucceedsWithValue(1));
+    for (int i = 0; i < kThreadCount; i++) {
+      if ((pollfds[i].revents & POLLIN) == 0) {
+        continue;
+      }
 
-    fd = ev.data.fd;
-    EXPECT_THAT(RetryEINTR(recvfrom)(fd, &data, sizeof(data), 0,
-                                     reinterpret_cast<struct sockaddr*>(&addr),
-                                     &addrlen),
-                SyscallSucceedsWithValue(sizeof(data)));
-    uint16_t const port =
-        ASSERT_NO_ERRNO_AND_VALUE(AddrPort(connector.family(), addr));
-    auto prev_port = portToFD.find(port);
-    // Check that all packets from one client have been delivered to the same
-    // server socket.
-    if (prev_port == portToFD.end()) {
-      portToFD[port] = ev.data.fd;
-    } else {
-      EXPECT_EQ(portToFD[port], ev.data.fd);
+      received++;
+
+      const int fd = pollfds[i].fd;
+      struct sockaddr_storage addr = {};
+      socklen_t addrlen = sizeof(addr);
+      int data;
+      EXPECT_THAT(RetryEINTR(recvfrom)(
+                      fd, &data, sizeof(data), 0,
+                      reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+                  SyscallSucceedsWithValue(sizeof(data)));
+      uint16_t const port =
+          ASSERT_NO_ERRNO_AND_VALUE(AddrPort(connector.family(), addr));
+      auto prev_port = portToFD.find(port);
+      // Check that all packets from one client have been delivered to the
+      // same server socket.
+      if (prev_port == portToFD.end()) {
+        portToFD[port] = fd;
+      } else {
+        EXPECT_EQ(portToFD[port], fd);
+      }
     }
   }
 }
@@ -1897,7 +1900,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, NoReusePortFollowingReusePort) {
 }
 
 INSTANTIATE_TEST_SUITE_P(
-    AllFamlies, SocketMultiProtocolInetLoopbackTest,
+    AllFamilies, SocketMultiProtocolInetLoopbackTest,
     ::testing::Values(ProtocolTestParam{"TCP", SOCK_STREAM},
                       ProtocolTestParam{"UDP", SOCK_DGRAM}),
     DescribeProtocolTestParam);
-- 
cgit v1.2.3


From 57ce26c0b465dce332a59c9fabb05f737ff4241d Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 18 Dec 2019 18:22:50 -0800
Subject: net/tcp: allow to call listen without bind

When listen(2) is called on an unbound socket, the socket is
automatically bound to a random free port with the local address
set to INADDR_ANY.

PiperOrigin-RevId: 286305906
---
 pkg/tcpip/transport/tcp/endpoint.go         | 13 ++++++++++++
 test/syscalls/linux/socket_inet_loopback.cc | 33 +++++++++++++++++++++--------
 2 files changed, 37 insertions(+), 9 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index dd8b47cbe..fe629aa40 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1974,6 +1974,15 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 		return nil
 	}
 
+	if e.state == StateInitial {
+		// The listen is called on an unbound socket, the socket is
+		// automatically bound to a random free port with the local
+		// address set to INADDR_ANY.
+		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
+			return err
+		}
+	}
+
 	// Endpoint must be bound before it can transition to listen mode.
 	if e.state != StateBound {
 		e.stats.ReadErrors.InvalidEndpointState.Increment()
@@ -2033,6 +2042,10 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
+	return e.bindLocked(addr)
+}
+
+func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 	// Don't allow binding once endpoint is not in the initial state
 	// anymore. This is because once the endpoint goes into a connected or
 	// listen state, it is already bound.
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 5bb9d2e99..619d41901 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -102,19 +102,17 @@ TEST(BadSocketPairArgs, ValidateErrForBadCallsToSocketPair) {
               SyscallFailsWithErrno(EAFNOSUPPORT));
 }
 
-TEST_P(SocketInetLoopbackTest, TCP) {
-  auto const& param = GetParam();
-
-  TestAddress const& listener = param.listener;
-  TestAddress const& connector = param.connector;
-
+void tcpSimpleConnectTest(TestAddress const& listener,
+                          TestAddress const& connector, bool unbound) {
   // Create the listening socket.
   const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
       Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
   sockaddr_storage listen_addr = listener.addr;
-  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
-                   listener.addr_len),
-              SyscallSucceeds());
+  if (!unbound) {
+    ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                     listener.addr_len),
+                SyscallSucceeds());
+  }
   ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
 
   // Get the port bound by the listening socket.
@@ -148,6 +146,23 @@ TEST_P(SocketInetLoopbackTest, TCP) {
   ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds());
 }
 
+TEST_P(SocketInetLoopbackTest, TCP) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+  // Listener is explicitly bound before listen(2): unbound == false.
+  tcpSimpleConnectTest(listener, connector, /*unbound=*/false);
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenUnbound) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+  // bind(2) is skipped, so listen(2) itself must auto-bind the socket:
+  // unbound == true.
+  tcpSimpleConnectTest(listener, connector, /*unbound=*/true);
+}
+
 TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   auto const& param = GetParam();
 
-- 
cgit v1.2.3


From 7419e0e5d74621b2be60e9b18e4e2d7bb2a65cc3 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 19 Dec 2019 16:05:35 -0800
Subject: Parameterize mmap tests.

This test suite has existed for quite a while and has become kind of messy.
Various tests can be joined together by parameterizing.

PiperOrigin-RevId: 286482240
---
 test/syscalls/linux/mmap.cc | 207 +++++++++++++++-----------------------------
 1 file changed, 69 insertions(+), 138 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index 6f2639d8a..1c4d9f1c7 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -814,23 +814,27 @@ class MMapFileTest : public MMapTest {
   }
 };
 
+class MMapFileParamTest
+    : public MMapFileTest,
+      public ::testing::WithParamInterface<std::tuple<int, int>> {
+ protected:
+  int prot() const { return std::get<0>(GetParam()); }
+
+  int flags() const { return std::get<1>(GetParam()); }
+};
+
 // MAP_POPULATE allowed.
 // There isn't a good way to verify it actually did anything.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, MapPopulate) {
-  ASSERT_THAT(
-      Map(0, kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd_.get(), 0),
-      SyscallSucceeds());
+TEST_P(MMapFileParamTest, MapPopulate) {
+  ASSERT_THAT(Map(0, kPageSize, prot(), flags() | MAP_POPULATE, fd_.get(), 0),
+              SyscallSucceeds());
 }
 
 // MAP_POPULATE on a short file.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, MapPopulateShort) {
-  ASSERT_THAT(Map(0, 2 * kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE,
-                  fd_.get(), 0),
-              SyscallSucceeds());
+TEST_P(MMapFileParamTest, MapPopulateShort) {
+  ASSERT_THAT(
+      Map(0, 2 * kPageSize, prot(), flags() | MAP_POPULATE, fd_.get(), 0),
+      SyscallSucceeds());
 }
 
 // Read contents from mapped file.
@@ -901,16 +905,6 @@ TEST_F(MMapFileTest, WritePrivateOnReadOnlyFd) {
             reinterpret_cast<volatile char*>(addr));
 }
 
-// MAP_PRIVATE PROT_READ is not allowed on write-only FDs.
-TEST_F(MMapFileTest, ReadPrivateOnWriteOnlyFd) {
-  const FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY));
-
-  uintptr_t addr;
-  EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_PRIVATE, fd.get(), 0),
-              SyscallFailsWithErrno(EACCES));
-}
-
 // MAP_SHARED PROT_WRITE not allowed on read-only FDs.
 TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) {
   const FileDescriptor fd =
@@ -922,28 +916,13 @@ TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) {
       SyscallFailsWithErrno(EACCES));
 }
 
-// MAP_SHARED PROT_READ not allowed on write-only FDs.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, ReadSharedOnWriteOnlyFd) {
-  const FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY));
-
-  uintptr_t addr;
-  EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd.get(), 0),
-              SyscallFailsWithErrno(EACCES));
-}
-
-// MAP_SHARED PROT_WRITE not allowed on write-only FDs.
-// The FD must always be readable.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, WriteSharedOnWriteOnlyFd) {
+// The FD must be readable.
+TEST_P(MMapFileParamTest, WriteOnlyFd) {
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY));
 
   uintptr_t addr;
-  EXPECT_THAT(addr = Map(0, kPageSize, PROT_WRITE, MAP_SHARED, fd.get(), 0),
+  EXPECT_THAT(addr = Map(0, kPageSize, prot(), flags(), fd.get(), 0),
               SyscallFailsWithErrno(EACCES));
 }
 
@@ -1182,7 +1161,7 @@ TEST_F(MMapFileTest, ReadSharedTruncateDownThenUp) {
   ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
               SyscallSucceeds());
 
-  // Check that the memory contains he file data.
+  // Check that the memory contains the file data.
   EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), buf.c_str(), kPageSize));
 
   // Truncate down, then up.
@@ -1371,125 +1350,68 @@ TEST_F(MMapFileTest, WritePrivate) {
               EqualsMemory(std::string(len, '\0')));
 }
 
-// SIGBUS raised when writing past end of file to a private mapping.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, SigBusDeathWritePrivate) {
+// SIGBUS raised when reading or writing past end of a mapped file.
+TEST_P(MMapFileParamTest, SigBusDeath) {
   SetupGvisorDeathTest();
 
   uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
-                         fd_.get(), 0),
+  ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0),
               SyscallSucceeds());
 
-  // MMapFileTest makes a file kPageSize/2 long. The entire first page will be
-  // accessible. Write just beyond that.
-  size_t len = strlen(kFileContents);
-  EXPECT_EXIT(std::copy(kFileContents, kFileContents + len,
-                        reinterpret_cast<volatile char*>(addr + kPageSize)),
-              ::testing::KilledBySignal(SIGBUS), "");
-}
-
-// SIGBUS raised when reading past end of file on a shared mapping.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, SigBusDeathReadShared) {
-  SetupGvisorDeathTest();
-
-  uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
-              SyscallSucceeds());
-
-  // MMapFileTest makes a file kPageSize/2 long. The entire first page will be
-  // accessible. Read just beyond that.
-  std::vector<char> in(kPageSize);
-  EXPECT_EXIT(
-      std::copy(reinterpret_cast<volatile char*>(addr + kPageSize),
-                reinterpret_cast<volatile char*>(addr + kPageSize) + kPageSize,
-                in.data()),
-      ::testing::KilledBySignal(SIGBUS), "");
+  auto* start = reinterpret_cast<volatile char*>(addr + kPageSize);
+
+  // MMapFileTest makes a file kPageSize/2 long. The entire first page should be
+  // accessible, but anything beyond it should not.
+  if (prot() & PROT_WRITE) {
+    // Write beyond first page.
+    size_t len = strlen(kFileContents);
+    EXPECT_EXIT(std::copy(kFileContents, kFileContents + len, start),
+                ::testing::KilledBySignal(SIGBUS), "");
+  } else {
+    // Read beyond first page.
+    std::vector<char> in(kPageSize);
+    EXPECT_EXIT(std::copy(start, start + kPageSize, in.data()),
+                ::testing::KilledBySignal(SIGBUS), "");
+  }
 }
 
-// SIGBUS raised when reading past end of file on a shared mapping.
+// Tests that SIGBUS is not raised when reading or writing to a file-mapped
+// page before EOF, even if part of the mapping extends beyond EOF.
 //
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, SigBusDeathWriteShared) {
-  SetupGvisorDeathTest();
-
+// See b/27877699.
+TEST_P(MMapFileParamTest, NoSigBusOnPagesBeforeEOF) {
   uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
-                         fd_.get(), 0),
-              SyscallSucceeds());
-
-  // MMapFileTest makes a file kPageSize/2 long. The entire first page will be
-  // accessible. Write just beyond that.
-  size_t len = strlen(kFileContents);
-  EXPECT_EXIT(std::copy(kFileContents, kFileContents + len,
-                        reinterpret_cast<volatile char*>(addr + kPageSize)),
-              ::testing::KilledBySignal(SIGBUS), "");
-}
-
-// Tests that SIGBUS is not raised when writing to a file-mapped page before
-// EOF, even if part of the mapping extends beyond EOF.
-TEST_F(MMapFileTest, NoSigBusOnPagesBeforeEOF) {
-  uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
-                         fd_.get(), 0),
+  ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0),
               SyscallSucceeds());
 
   // The test passes if this survives.
-  size_t len = strlen(kFileContents);
-  std::copy(kFileContents, kFileContents + len,
-            reinterpret_cast<volatile char*>(addr));
-}
-
-// Tests that SIGBUS is not raised when writing to a file-mapped page containing
-// EOF, *after* the EOF for a private mapping.
-TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWritePrivate) {
-  uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
-                         fd_.get(), 0),
-              SyscallSucceeds());
-
-  // The test passes if this survives. (Technically addr+kPageSize/2 is already
-  // beyond EOF, but +1 to check for fencepost errors.)
-  size_t len = strlen(kFileContents);
-  std::copy(kFileContents, kFileContents + len,
-            reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1));
-}
-
-// Tests that SIGBUS is not raised when reading from a file-mapped page
-// containing EOF, *after* the EOF for a shared mapping.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFReadShared) {
-  uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
-              SyscallSucceeds());
-
-  // The test passes if this survives. (Technically addr+kPageSize/2 is already
-  // beyond EOF, but +1 to check for fencepost errors.)
   auto* start = reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1);
   size_t len = strlen(kFileContents);
-  std::vector<char> in(len);
-  std::copy(start, start + len, in.data());
+  if (prot() & PROT_WRITE) {
+    std::copy(kFileContents, kFileContents + len, start);
+  } else {
+    std::vector<char> in(len);
+    std::copy(start, start + len, in.data());
+  }
 }
 
-// Tests that SIGBUS is not raised when writing to a file-mapped page containing
-// EOF, *after* the EOF for a shared mapping.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWriteShared) {
+// Tests that SIGBUS is not raised when reading or writing from a file-mapped
+// page containing EOF, *after* the EOF.
+TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) {
   uintptr_t addr;
-  ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
-                         fd_.get(), 0),
+  ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0),
               SyscallSucceeds());
 
   // The test passes if this survives. (Technically addr+kPageSize/2 is already
   // beyond EOF, but +1 to check for fencepost errors.)
+  auto* start = reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1);
   size_t len = strlen(kFileContents);
-  std::copy(kFileContents, kFileContents + len,
-            reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1));
+  if (prot() & PROT_WRITE) {
+    std::copy(kFileContents, kFileContents + len, start);
+  } else {
+    std::vector<char> in(len);
+    std::copy(start, start + len, in.data());
+  }
 }
 
 // Tests that reading from writable shared file-mapped pages succeeds.
@@ -1733,6 +1655,15 @@ TEST(MMapNoFixtureTest, Map32Bit) {
 
 #endif  // defined(__x86_64__)
 
+INSTANTIATE_TEST_SUITE_P(
+    ReadWriteSharedPrivate, MMapFileParamTest,
+    ::testing::Combine(::testing::ValuesIn({
+                           PROT_READ,
+                           PROT_WRITE,
+                           PROT_READ | PROT_WRITE,
+                       }),
+                       ::testing::ValuesIn({MAP_SHARED, MAP_PRIVATE})));
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From 29955a4797e8264f75886a989dbc81b2b5443f4c Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Thu, 19 Dec 2019 17:25:18 -0800
Subject: futex: wake one waiter if futex_wake is called with a non-positive
 value

This change is needed to be compatible with the Linux kernel.

There is no glibc wrapper for the futex system call, so it is easy to
make a mistake and call syscall(__NR_futex, FUTEX_WAKE, addr) without
the fourth argument. This works on Linux, because it wakes one waiter
even if val is nonpositive.

PiperOrigin-RevId: 286494396
---
 pkg/sentry/syscalls/linux/sys_futex.go | 10 ++++++++++
 test/syscalls/linux/futex.cc           | 21 +++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
index b9bd25464..bde17a767 100644
--- a/pkg/sentry/syscalls/linux/sys_futex.go
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -226,6 +226,11 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		if mask == 0 {
 			return 0, nil, syserror.EINVAL
 		}
+		if val <= 0 {
+			// The Linux kernel wakes one waiter even if val is
+			// non-positive.
+			val = 1
+		}
 		n, err := t.Futex().Wake(t, addr, private, mask, val)
 		return uintptr(n), nil, err
 
@@ -242,6 +247,11 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 
 	case linux.FUTEX_WAKE_OP:
 		op := uint32(val3)
+		if val <= 0 {
+			// The Linux kernel wakes one waiter even if val is
+			// non-positive.
+			val = 1
+		}
 		n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op)
 		return uintptr(n), nil, err
 
diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc
index d3e3f998c..40c80a6e1 100644
--- a/test/syscalls/linux/futex.cc
+++ b/test/syscalls/linux/futex.cc
@@ -239,6 +239,27 @@ TEST_P(PrivateAndSharedFutexTest, Wake1_NoRandomSave) {
   EXPECT_THAT(futex_wake(IsPrivate(), &a, 1), SyscallSucceedsWithValue(1));
 }
 
+TEST_P(PrivateAndSharedFutexTest, Wake0_NoRandomSave) {
+  constexpr int kInitialValue = 1;
+  std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+
+  // Prevent save/restore from interrupting futex_wait, which will cause it to
+  // return EAGAIN instead of the expected result if futex_wait is restarted
+  // after we change the value of a below.
+  DisableSave ds;
+  ScopedThread thread([&] {
+    EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue),
+                SyscallSucceedsWithValue(0));
+  });
+  absl::SleepFor(kWaiterStartupDelay);
+
+  // Change a so that if futex_wake happens before futex_wait, the latter
+  // returns EAGAIN instead of hanging the test.
+  a.fetch_add(1);
+  // The Linux kernel wakes one waiter even if val is 0 or negative.
+  EXPECT_THAT(futex_wake(IsPrivate(), &a, 0), SyscallSucceedsWithValue(1));
+}
+
 TEST_P(PrivateAndSharedFutexTest, WakeAll_NoRandomSave) {
   constexpr int kInitialValue = 1;
   std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
-- 
cgit v1.2.3


From e013c48c78c9a7daf245b7de9563e3a0bd8a1e97 Mon Sep 17 00:00:00 2001
From: Ryan Heacock <rheacock@google.com>
Date: Tue, 24 Dec 2019 08:48:14 -0800
Subject: Enable IP_RECVTOS socket option for datagram sockets

Added the ability to get/set the IP_RECVTOS socket option on UDP endpoints. If
enabled, the TOS from the incoming network header is passed as ancillary data
in the ControlMessages.

Test:
* Added unit test to udp_test.go that tests getting/setting as well as
verifying that we receive expected TOS from incoming packet.
* Added a syscall test
PiperOrigin-RevId: 287029703
---
 pkg/sentry/socket/control/control.go         |  2 +-
 pkg/sentry/socket/netstack/netstack.go       | 42 ++++++++++++++++-
 pkg/tcpip/checker/checker.go                 | 16 +++++++
 pkg/tcpip/stack/nic.go                       |  2 +-
 pkg/tcpip/stack/stack.go                     |  2 +-
 pkg/tcpip/tcpip.go                           |  6 ++-
 pkg/tcpip/transport/raw/endpoint.go          |  2 +-
 pkg/tcpip/transport/udp/endpoint.go          | 31 ++++++++++++-
 pkg/tcpip/transport/udp/udp_test.go          | 69 ++++++++++++++++++++++++----
 test/syscalls/linux/socket_ip_udp_generic.cc | 40 ++++++++++++++++
 test/syscalls/linux/udp_socket_test_cases.cc |  8 ++--
 11 files changed, 201 insertions(+), 19 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index af1a4e95f..b649dd021 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -327,7 +327,7 @@ func PackInq(t *kernel.Task, inq int32, buf []byte) []byte {
 }
 
 // PackTOS packs an IP_TOS socket control message.
-func PackTOS(t *kernel.Task, tos int8, buf []byte) []byte {
+func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
 	return putCmsgStruct(
 		buf,
 		linux.SOL_IP,
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 140851c17..d2f263402 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1323,6 +1323,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		}
 		return int32(v), nil
 
+	case linux.IP_RECVTOS:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.ReceiveTOSOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		if v {
+			return int32(1), nil
+		}
+		return int32(0), nil
+
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1808,6 +1823,16 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v)))
 
+	case linux.IP_RECVTOS:
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+
+		return syserr.TranslateNetstackError(ep.SetSockOpt(
+			tcpip.ReceiveTOSOption(v != 0),
+		))
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
@@ -1828,7 +1853,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		linux.IP_RECVFRAGSIZE,
 		linux.IP_RECVOPTS,
 		linux.IP_RECVORIGDSTADDR,
-		linux.IP_RECVTOS,
 		linux.IP_RECVTTL,
 		linux.IP_RETOPTS,
 		linux.IP_TRANSPARENT,
@@ -2139,6 +2163,21 @@ func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) {
 	cmsg.IP.Inq = int32(len(s.readView) + rcvBufUsed)
 }
 
+func (s *SocketOperations) fillCmsgTOS(cmsg *socket.ControlMessages) {
+	if s.skType != linux.SOCK_DGRAM {
+		return
+	}
+	var receiveTOS tcpip.ReceiveTOSOption
+	if err := s.Endpoint.GetSockOpt(&receiveTOS); err != nil {
+		return
+	}
+	if !receiveTOS {
+		return
+	}
+	cmsg.IP.HasTOS = s.readCM.HasTOS
+	cmsg.IP.TOS = s.readCM.TOS
+}
+
 // nonBlockingRead issues a non-blocking read.
 //
 // TODO(b/78348848): Support timestamps for stream sockets.
@@ -2244,6 +2283,7 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 
 	cmsg := s.controlMessages()
 	s.fillCmsgInq(&cmsg)
+	s.fillCmsgTOS(&cmsg)
 	return n, flags, addr, addrLen, cmsg, syserr.FromError(err)
 }
 
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 2f15bf1f1..542abc99d 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -33,6 +33,9 @@ type NetworkChecker func(*testing.T, []header.Network)
 // TransportChecker is a function to check a property of a transport packet.
 type TransportChecker func(*testing.T, header.Transport)
 
+// ControlMessagesChecker is a function to check a property of ancillary data.
+type ControlMessagesChecker func(*testing.T, tcpip.ControlMessages)
+
 // IPv4 checks the validity and properties of the given IPv4 packet. It is
 // expected to be used in conjunction with other network checkers for specific
 // properties. For example, to check the source and destination address, one
@@ -158,6 +161,19 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	}
 }
 
+// ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
+func ReceiveTOS(want uint8) ControlMessagesChecker {
+	return func(t *testing.T, cm tcpip.ControlMessages) {
+		t.Helper()
+		if !cm.HasTOS {
+			t.Fatalf("got cm.HasTOS = %t, want cm.TOS = %d", cm.HasTOS, want)
+		}
+		if got := cm.TOS; got != want {
+			t.Fatalf("got cm.TOS = %d, want %d", got, want)
+		}
+	}
+}
+
 // TOS creates a checker that checks the TOS field.
 func TOS(tos uint8, label uint32) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index ddd014658..a4556674b 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -575,7 +575,7 @@ func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) {
 	n.mu.Unlock()
 }
 
-// Subnets returns the Subnets associated with this NIC.
+// AddressRanges returns the Subnets associated with this NIC.
 func (n *NIC) AddressRanges() []tcpip.Subnet {
 	n.mu.RLock()
 	defer n.mu.RUnlock()
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 7a9600679..251336224 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -829,7 +829,7 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	return false
 }
 
-// NICSubnets returns a map of NICIDs to their associated subnets.
+// NICAddressRanges returns a map of NICIDs to their associated subnets.
 func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index f62fd729f..5c7b2af88 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -322,7 +322,7 @@ type ControlMessages struct {
 	HasTOS bool
 
 	// TOS is the IPv4 type of service of the associated packet.
-	TOS int8
+	TOS uint8
 
 	// HasTClass indicates whether Tclass is valid/set.
 	HasTClass bool
@@ -666,6 +666,10 @@ type IPv4TOSOption uint8
 // for all subsequent outgoing IPv6 packets from the endpoint.
 type IPv6TrafficClassOption uint8
 
+// ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
+// ancillary message is passed with incoming packets.
+type ReceiveTOSOption bool
+
 // Route is a row in the routing table. It specifies through which NIC (and
 // gateway) sets of packets should be routed. A row is considered viable if the
 // masked target address matches the destination address in the row.
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 5aafe2615..6d23ab5a1 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -510,7 +510,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
-func (ep *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 1ac4705af..269470ed4 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -32,6 +32,7 @@ type udpPacket struct {
 	senderAddress tcpip.FullAddress
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
+	tos           uint8
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -114,6 +115,10 @@ type endpoint struct {
 	// applied while sending packets. Defaults to 0 as on Linux.
 	sendTOS uint8
 
+	// receiveTOS determines if the incoming IPv4 TOS header field is passed
+	// as ancillary data to ControlMessages on Read.
+	receiveTOS bool
+
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -244,7 +249,12 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 		*addr = p.senderAddress
 	}
 
-	return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil
+	return p.data.ToView(), tcpip.ControlMessages{
+		HasTimestamp: true,
+		Timestamp:    p.timestamp,
+		HasTOS:       e.receiveTOS,
+		TOS:          p.tos,
+	}, nil
 }
 
 // prepareForWrite prepares the endpoint for sending data. In particular, it
@@ -656,6 +666,12 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.sendTOS = uint8(v)
 		e.mu.Unlock()
 		return nil
+
+	case tcpip.ReceiveTOSOption:
+		e.mu.Lock()
+		e.receiveTOS = bool(v)
+		e.mu.Unlock()
+		return nil
 	}
 	return nil
 }
@@ -792,6 +808,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.RUnlock()
 		return nil
 
+	case *tcpip.ReceiveTOSOption:
+		e.mu.RLock()
+		*o = tcpip.ReceiveTOSOption(e.receiveTOS)
+		e.mu.RUnlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -1238,6 +1260,13 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	e.rcvList.PushBack(packet)
 	e.rcvBufSize += pkt.Data.Size()
 
+	// Save any useful information from the NetworkHeader to the packet.
+	switch r.NetProto {
+	case header.IPv4ProtocolNumber:
+		// This packet has already been validated before being passed up the stack.
+		packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
+	}
+
 	packet.timestamp = e.stack.NowNanoseconds()
 
 	e.rcvMu.Unlock()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 7051a7a9c..43b8b35ba 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -56,6 +56,7 @@ const (
 	multicastAddr   = "\xe8\x2b\xd3\xea"
 	multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
 	broadcastAddr   = header.IPv4Broadcast
+	testTOS         = 0x80
 
 	// defaultMTU is the MTU, in bytes, used throughout the tests, except
 	// where another value is explicitly used. It is chosen to match the MTU
@@ -453,6 +454,7 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
 	ip := header.IPv4(buf)
 	ip.Encode(&header.IPv4Fields{
 		IHL:         header.IPv4MinimumSize,
+		TOS:         testTOS,
 		TotalLength: uint16(len(buf)),
 		TTL:         65,
 		Protocol:    uint8(udp.ProtocolNumber),
@@ -556,8 +558,8 @@ func TestBindToDeviceOption(t *testing.T) {
 // testReadInternal sends a packet of the given test flow into the stack by
 // injecting it into the link endpoint. It then attempts to read it from the
 // UDP endpoint and depending on if this was expected to succeed verifies its
-// correctness.
-func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool) {
+// correctness  including any additional checker functions provided.
+func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool, checkers ...checker.ControlMessagesChecker) {
 	c.t.Helper()
 
 	payload := newPayload()
@@ -572,12 +574,12 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
 
 	var addr tcpip.FullAddress
-	v, _, err := c.ep.Read(&addr)
+	v, cm, err := c.ep.Read(&addr)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for data to become available.
 		select {
 		case <-ch:
-			v, _, err = c.ep.Read(&addr)
+			v, cm, err = c.ep.Read(&addr)
 
 		case <-time.After(300 * time.Millisecond):
 			if packetShouldBeDropped {
@@ -610,15 +612,21 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	if !bytes.Equal(payload, v) {
 		c.t.Fatalf("bad payload: got %x, want %x", v, payload)
 	}
+
+	// Run any checkers against the ControlMessages.
+	for _, f := range checkers {
+		f(c.t, cm)
+	}
+
 	c.checkEndpointReadStats(1, epstats, err)
 }
 
 // testRead sends a packet of the given test flow into the stack by injecting it
 // into the link endpoint. It then reads it from the UDP endpoint and verifies
-// its correctness.
-func testRead(c *testContext, flow testFlow) {
+// its correctness including any additional checker functions provided.
+func testRead(c *testContext, flow testFlow, checkers ...checker.ControlMessagesChecker) {
 	c.t.Helper()
-	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */)
+	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */, checkers...)
 }
 
 // testFailingRead sends a packet of the given test flow into the stack by
@@ -1286,7 +1294,7 @@ func TestTOSV4(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = 0xC0
+			const tos = testTOS
 			var v tcpip.IPv4TOSOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1321,7 +1329,7 @@ func TestTOSV6(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = 0xC0
+			const tos = testTOS
 			var v tcpip.IPv6TrafficClassOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1348,6 +1356,49 @@ func TestTOSV6(t *testing.T) {
 	}
 }
 
+func TestReceiveTOSV4(t *testing.T) {
+	for _, flow := range []testFlow{unicastV4, broadcast} {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			c := newDualTestContext(t, defaultMTU)
+			defer c.cleanup()
+
+			c.createEndpointForFlow(flow)
+
+			// Verify that setting and reading the option works.
+			const recvTos = true
+			var v tcpip.ReceiveTOSOption
+			if err := c.ep.GetSockOpt(&v); err != nil {
+				c.t.Errorf("GetSockopt failed: %s", err)
+			}
+			// Test for expected default value.
+			if v != false {
+				c.t.Errorf("got GetSockOpt(...) = %t, want = %t", v, false)
+			}
+
+			if err := c.ep.SetSockOpt(tcpip.ReceiveTOSOption(recvTos)); err != nil {
+				c.t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.ReceiveTOSOption(recvTos), err)
+			}
+
+			if err := c.ep.GetSockOpt(&v); err != nil {
+				c.t.Errorf("GetSockopt failed: %s", err)
+			}
+
+			if want := tcpip.ReceiveTOSOption(recvTos); v != want {
+				c.t.Errorf("got GetSockOpt(...) = %t, want = %t", v, want)
+			}
+
+			// Bind to wildcard.
+			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+				c.t.Fatalf("Bind failed: %s", err)
+			}
+
+			// Verify that the correct received TOS is actually handed through as
+			// ancillary data to the ControlMessages struct.
+			testRead(c, flow, checker.ReceiveTOS(testTOS))
+		})
+	}
+}
+
 func TestMulticastInterfaceOption(t *testing.T) {
 	for _, flow := range []testFlow{multicastV4, multicastV4in6, multicastV6, multicastV6Only} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 66eb68857..53290bed7 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -209,6 +209,46 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
+// Ensure that Receiving TOS is off by default.
+TEST_P(UDPSocketPairTest, RecvTosDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that setting and getting IP_RECVTOS works as expected.
+TEST_P(UDPSocketPairTest, SetRecvTos) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
+                         &kSockOptOff, sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+}
+
 TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index dc35c2f50..68e0a8109 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,8 +1349,9 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
-  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
+          !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1421,7 +1422,8 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From 87e4d03fdf576348ac7023c599e0fc66ad4cccbd Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 26 Dec 2019 13:04:14 -0800
Subject: Automated rollback of changelist 287029703

PiperOrigin-RevId: 287217899
---
 pkg/sentry/socket/control/control.go         |  2 +-
 pkg/sentry/socket/netstack/netstack.go       | 42 +----------------
 pkg/tcpip/checker/checker.go                 | 16 -------
 pkg/tcpip/stack/nic.go                       |  2 +-
 pkg/tcpip/stack/stack.go                     |  2 +-
 pkg/tcpip/tcpip.go                           |  6 +--
 pkg/tcpip/transport/raw/endpoint.go          |  2 +-
 pkg/tcpip/transport/udp/endpoint.go          | 31 +------------
 pkg/tcpip/transport/udp/udp_test.go          | 69 ++++------------------------
 test/syscalls/linux/socket_ip_udp_generic.cc | 40 ----------------
 test/syscalls/linux/udp_socket_test_cases.cc |  8 ++--
 11 files changed, 19 insertions(+), 201 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index b649dd021..af1a4e95f 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -327,7 +327,7 @@ func PackInq(t *kernel.Task, inq int32, buf []byte) []byte {
 }
 
 // PackTOS packs an IP_TOS socket control message.
-func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
+func PackTOS(t *kernel.Task, tos int8, buf []byte) []byte {
 	return putCmsgStruct(
 		buf,
 		linux.SOL_IP,
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index d2f263402..140851c17 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1323,21 +1323,6 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		}
 		return int32(v), nil
 
-	case linux.IP_RECVTOS:
-		if outLen < sizeOfInt32 {
-			return nil, syserr.ErrInvalidArgument
-		}
-
-		var v tcpip.ReceiveTOSOption
-		if err := ep.GetSockOpt(&v); err != nil {
-			return nil, syserr.TranslateNetstackError(err)
-		}
-
-		if v {
-			return int32(1), nil
-		}
-		return int32(0), nil
-
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1823,16 +1808,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v)))
 
-	case linux.IP_RECVTOS:
-		v, err := parseIntOrChar(optVal)
-		if err != nil {
-			return err
-		}
-
-		return syserr.TranslateNetstackError(ep.SetSockOpt(
-			tcpip.ReceiveTOSOption(v != 0),
-		))
-
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
@@ -1853,6 +1828,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		linux.IP_RECVFRAGSIZE,
 		linux.IP_RECVOPTS,
 		linux.IP_RECVORIGDSTADDR,
+		linux.IP_RECVTOS,
 		linux.IP_RECVTTL,
 		linux.IP_RETOPTS,
 		linux.IP_TRANSPARENT,
@@ -2163,21 +2139,6 @@ func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) {
 	cmsg.IP.Inq = int32(len(s.readView) + rcvBufUsed)
 }
 
-func (s *SocketOperations) fillCmsgTOS(cmsg *socket.ControlMessages) {
-	if s.skType != linux.SOCK_DGRAM {
-		return
-	}
-	var receiveTOS tcpip.ReceiveTOSOption
-	if err := s.Endpoint.GetSockOpt(&receiveTOS); err != nil {
-		return
-	}
-	if !receiveTOS {
-		return
-	}
-	cmsg.IP.HasTOS = s.readCM.HasTOS
-	cmsg.IP.TOS = s.readCM.TOS
-}
-
 // nonBlockingRead issues a non-blocking read.
 //
 // TODO(b/78348848): Support timestamps for stream sockets.
@@ -2283,7 +2244,6 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 
 	cmsg := s.controlMessages()
 	s.fillCmsgInq(&cmsg)
-	s.fillCmsgTOS(&cmsg)
 	return n, flags, addr, addrLen, cmsg, syserr.FromError(err)
 }
 
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 542abc99d..2f15bf1f1 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -33,9 +33,6 @@ type NetworkChecker func(*testing.T, []header.Network)
 // TransportChecker is a function to check a property of a transport packet.
 type TransportChecker func(*testing.T, header.Transport)
 
-// ControlMessagesChecker is a function to check a property of ancillary data.
-type ControlMessagesChecker func(*testing.T, tcpip.ControlMessages)
-
 // IPv4 checks the validity and properties of the given IPv4 packet. It is
 // expected to be used in conjunction with other network checkers for specific
 // properties. For example, to check the source and destination address, one
@@ -161,19 +158,6 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	}
 }
 
-// ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
-func ReceiveTOS(want uint8) ControlMessagesChecker {
-	return func(t *testing.T, cm tcpip.ControlMessages) {
-		t.Helper()
-		if !cm.HasTOS {
-			t.Fatalf("got cm.HasTOS = %t, want cm.TOS = %d", cm.HasTOS, want)
-		}
-		if got := cm.TOS; got != want {
-			t.Fatalf("got cm.TOS = %d, want %d", got, want)
-		}
-	}
-}
-
 // TOS creates a checker that checks the TOS field.
 func TOS(tos uint8, label uint32) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index a4556674b..ddd014658 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -575,7 +575,7 @@ func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) {
 	n.mu.Unlock()
 }
 
-// AddressRanges returns the Subnets associated with this NIC.
+// Subnets returns the Subnets associated with this NIC.
 func (n *NIC) AddressRanges() []tcpip.Subnet {
 	n.mu.RLock()
 	defer n.mu.RUnlock()
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 251336224..7a9600679 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -829,7 +829,7 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	return false
 }
 
-// NICAddressRanges returns a map of NICIDs to their associated subnets.
+// NICSubnets returns a map of NICIDs to their associated subnets.
 func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 5c7b2af88..f62fd729f 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -322,7 +322,7 @@ type ControlMessages struct {
 	HasTOS bool
 
 	// TOS is the IPv4 type of service of the associated packet.
-	TOS uint8
+	TOS int8
 
 	// HasTClass indicates whether Tclass is valid/set.
 	HasTClass bool
@@ -666,10 +666,6 @@ type IPv4TOSOption uint8
 // for all subsequent outgoing IPv6 packets from the endpoint.
 type IPv6TrafficClassOption uint8
 
-// ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
-// ancillary message is passed with incoming packets.
-type ReceiveTOSOption bool
-
 // Route is a row in the routing table. It specifies through which NIC (and
 // gateway) sets of packets should be routed. A row is considered viable if the
 // masked target address matches the destination address in the row.
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 6d23ab5a1..5aafe2615 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -510,7 +510,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
-func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (ep *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
 	return tcpip.ErrUnknownProtocolOption
 }
 
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 269470ed4..1ac4705af 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -32,7 +32,6 @@ type udpPacket struct {
 	senderAddress tcpip.FullAddress
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
-	tos           uint8
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -115,10 +114,6 @@ type endpoint struct {
 	// applied while sending packets. Defaults to 0 as on Linux.
 	sendTOS uint8
 
-	// receiveTOS determines if the incoming IPv4 TOS header field is passed
-	// as ancillary data to ControlMessages on Read.
-	receiveTOS bool
-
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -249,12 +244,7 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 		*addr = p.senderAddress
 	}
 
-	return p.data.ToView(), tcpip.ControlMessages{
-		HasTimestamp: true,
-		Timestamp:    p.timestamp,
-		HasTOS:       e.receiveTOS,
-		TOS:          p.tos,
-	}, nil
+	return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil
 }
 
 // prepareForWrite prepares the endpoint for sending data. In particular, it
@@ -666,12 +656,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.sendTOS = uint8(v)
 		e.mu.Unlock()
 		return nil
-
-	case tcpip.ReceiveTOSOption:
-		e.mu.Lock()
-		e.receiveTOS = bool(v)
-		e.mu.Unlock()
-		return nil
 	}
 	return nil
 }
@@ -808,12 +792,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.RUnlock()
 		return nil
 
-	case *tcpip.ReceiveTOSOption:
-		e.mu.RLock()
-		*o = tcpip.ReceiveTOSOption(e.receiveTOS)
-		e.mu.RUnlock()
-		return nil
-
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -1260,13 +1238,6 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	e.rcvList.PushBack(packet)
 	e.rcvBufSize += pkt.Data.Size()
 
-	// Save any useful information from the NetworkHeader to the packet.
-	switch r.NetProto {
-	case header.IPv4ProtocolNumber:
-		// This packet has already been validated before being passed up the stack.
-		packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
-	}
-
 	packet.timestamp = e.stack.NowNanoseconds()
 
 	e.rcvMu.Unlock()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 43b8b35ba..7051a7a9c 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -56,7 +56,6 @@ const (
 	multicastAddr   = "\xe8\x2b\xd3\xea"
 	multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
 	broadcastAddr   = header.IPv4Broadcast
-	testTOS         = 0x80
 
 	// defaultMTU is the MTU, in bytes, used throughout the tests, except
 	// where another value is explicitly used. It is chosen to match the MTU
@@ -454,7 +453,6 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
 	ip := header.IPv4(buf)
 	ip.Encode(&header.IPv4Fields{
 		IHL:         header.IPv4MinimumSize,
-		TOS:         testTOS,
 		TotalLength: uint16(len(buf)),
 		TTL:         65,
 		Protocol:    uint8(udp.ProtocolNumber),
@@ -558,8 +556,8 @@ func TestBindToDeviceOption(t *testing.T) {
 // testReadInternal sends a packet of the given test flow into the stack by
 // injecting it into the link endpoint. It then attempts to read it from the
 // UDP endpoint and depending on if this was expected to succeed verifies its
-// correctness  including any additional checker functions provided.
-func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool, checkers ...checker.ControlMessagesChecker) {
+// correctness.
+func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool) {
 	c.t.Helper()
 
 	payload := newPayload()
@@ -574,12 +572,12 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
 
 	var addr tcpip.FullAddress
-	v, cm, err := c.ep.Read(&addr)
+	v, _, err := c.ep.Read(&addr)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for data to become available.
 		select {
 		case <-ch:
-			v, cm, err = c.ep.Read(&addr)
+			v, _, err = c.ep.Read(&addr)
 
 		case <-time.After(300 * time.Millisecond):
 			if packetShouldBeDropped {
@@ -612,21 +610,15 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	if !bytes.Equal(payload, v) {
 		c.t.Fatalf("bad payload: got %x, want %x", v, payload)
 	}
-
-	// Run any checkers against the ControlMessages.
-	for _, f := range checkers {
-		f(c.t, cm)
-	}
-
 	c.checkEndpointReadStats(1, epstats, err)
 }
 
 // testRead sends a packet of the given test flow into the stack by injecting it
 // into the link endpoint. It then reads it from the UDP endpoint and verifies
-// its correctness including any additional checker functions provided.
-func testRead(c *testContext, flow testFlow, checkers ...checker.ControlMessagesChecker) {
+// its correctness.
+func testRead(c *testContext, flow testFlow) {
 	c.t.Helper()
-	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */, checkers...)
+	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */)
 }
 
 // testFailingRead sends a packet of the given test flow into the stack by
@@ -1294,7 +1286,7 @@ func TestTOSV4(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = testTOS
+			const tos = 0xC0
 			var v tcpip.IPv4TOSOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1329,7 +1321,7 @@ func TestTOSV6(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = testTOS
+			const tos = 0xC0
 			var v tcpip.IPv6TrafficClassOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1356,49 +1348,6 @@ func TestTOSV6(t *testing.T) {
 	}
 }
 
-func TestReceiveTOSV4(t *testing.T) {
-	for _, flow := range []testFlow{unicastV4, broadcast} {
-		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
-			c := newDualTestContext(t, defaultMTU)
-			defer c.cleanup()
-
-			c.createEndpointForFlow(flow)
-
-			// Verify that setting and reading the option works.
-			const recvTos = true
-			var v tcpip.ReceiveTOSOption
-			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
-			}
-			// Test for expected default value.
-			if v != false {
-				c.t.Errorf("got GetSockOpt(...) = %t, want = %t", v, false)
-			}
-
-			if err := c.ep.SetSockOpt(tcpip.ReceiveTOSOption(recvTos)); err != nil {
-				c.t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.ReceiveTOSOption(recvTos), err)
-			}
-
-			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
-			}
-
-			if want := tcpip.ReceiveTOSOption(recvTos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %t, want = %t", v, want)
-			}
-
-			// Bind to wildcard.
-			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-				c.t.Fatalf("Bind failed: %s", err)
-			}
-
-			// Verify that the correct received TOS is actually handed through as
-			// ancillary data to the ControlMessages struct.
-			testRead(c, flow, checker.ReceiveTOS(testTOS))
-		})
-	}
-}
-
 func TestMulticastInterfaceOption(t *testing.T) {
 	for _, flow := range []testFlow{multicastV4, multicastV4in6, multicastV6, multicastV6Only} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 53290bed7..66eb68857 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -209,46 +209,6 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
-// Ensure that Receiving TOS is off by default.
-TEST_P(UDPSocketPairTest, RecvTosDefault) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-}
-
-// Test that setting and getting IP_RECVTOS works as expected.
-TEST_P(UDPSocketPairTest, SetRecvTos) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOff, sizeof(kSockOptOff)),
-              SyscallSucceeds());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOn, sizeof(kSockOptOn)),
-              SyscallSucceeds());
-
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOn);
-}
-
 TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 68e0a8109..dc35c2f50 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,9 +1349,8 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
-  SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
-          !IsRunningWithHostinet());
+  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
+  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1422,8 +1421,7 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
-  // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
+  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From e77ad574233b779519a253c6f58197c339e9100a Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 7 Jan 2020 17:25:18 -0800
Subject: Fix partial_bad_buffer write tests.

The write tests are fitted to Linux-specific behavior, but that behavior is
not well-specified. Tweak the tests to allow for both acceptable outcomes.

PiperOrigin-RevId: 288606386
---
 test/syscalls/linux/partial_bad_buffer.cc | 138 ++++++++++++++----------------
 1 file changed, 64 insertions(+), 74 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/partial_bad_buffer.cc b/test/syscalls/linux/partial_bad_buffer.cc
index 33822ee57..df7129acc 100644
--- a/test/syscalls/linux/partial_bad_buffer.cc
+++ b/test/syscalls/linux/partial_bad_buffer.cc
@@ -18,7 +18,9 @@
 #include <netinet/tcp.h>
 #include <sys/mman.h>
 #include <sys/socket.h>
+#include <sys/stat.h>
 #include <sys/syscall.h>
+#include <sys/types.h>
 #include <sys/uio.h>
 #include <unistd.h>
 
@@ -62,9 +64,9 @@ class PartialBadBufferTest : public ::testing::Test {
     // Write some initial data.
     size_t size = sizeof(kMessage) - 1;
     EXPECT_THAT(WriteFd(fd_, &kMessage, size), SyscallSucceedsWithValue(size));
-
     ASSERT_THAT(lseek(fd_, 0, SEEK_SET), SyscallSucceeds());
 
+    // Map a useable buffer.
     addr_ = mmap(0, 2 * kPageSize, PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
     ASSERT_NE(addr_, MAP_FAILED);
@@ -79,6 +81,15 @@ class PartialBadBufferTest : public ::testing::Test {
     bad_buffer_ = buf + kPageSize - 1;
   }
 
+  off_t Size() {
+    struct stat st;
+    int rc = fstat(fd_, &st);
+    if (rc < 0) {
+      return static_cast<off_t>(rc);
+    }
+    return st.st_size;
+  }
+
   void TearDown() override {
     EXPECT_THAT(munmap(addr_, 2 * kPageSize), SyscallSucceeds()) << addr_;
     EXPECT_THAT(close(fd_), SyscallSucceeds());
@@ -165,97 +176,99 @@ TEST_F(PartialBadBufferTest, PreadvSmall) {
 }
 
 TEST_F(PartialBadBufferTest, WriteBig) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, kPageSize),
-              SyscallFailsWithErrno(EFAULT));
+  ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+  EXPECT_THAT(
+      (n = RetryEINTR(write)(fd_, bad_buffer_, kPageSize)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, WriteSmall) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, 10),
-              SyscallFailsWithErrno(EFAULT));
+  ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+  EXPECT_THAT(
+      (n = RetryEINTR(write)(fd_, bad_buffer_, 10)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, PwriteBig) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(pwrite)(fd_, bad_buffer_, kPageSize, 0),
-              SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(
+      (n = RetryEINTR(pwrite)(fd_, bad_buffer_, kPageSize, orig_size)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, PwriteSmall) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(pwrite)(fd_, bad_buffer_, 10, 0),
-              SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(
+      (n = RetryEINTR(pwrite)(fd_, bad_buffer_, 10, orig_size)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, WritevBig) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
-
   struct iovec vec;
   vec.iov_base = bad_buffer_;
   vec.iov_len = kPageSize;
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(writev)(fd_, &vec, 1), SyscallFailsWithErrno(EFAULT));
+  ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+  EXPECT_THAT(
+      (n = RetryEINTR(writev)(fd_, &vec, 1)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, WritevSmall) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
-
   struct iovec vec;
   vec.iov_base = bad_buffer_;
   vec.iov_len = 10;
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(writev)(fd_, &vec, 1), SyscallFailsWithErrno(EFAULT));
+  ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+  EXPECT_THAT(
+      (n = RetryEINTR(writev)(fd_, &vec, 1)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, PwritevBig) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
-
   struct iovec vec;
   vec.iov_base = bad_buffer_;
   vec.iov_len = kPageSize;
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(pwritev)(fd_, &vec, 1, 0),
-              SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(
+      (n = RetryEINTR(pwritev)(fd_, &vec, 1, orig_size)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 TEST_F(PartialBadBufferTest, PwritevSmall) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
-
   struct iovec vec;
   vec.iov_base = bad_buffer_;
   vec.iov_len = 10;
+  off_t orig_size = Size();
+  int n;
 
-  EXPECT_THAT(RetryEINTR(pwritev)(fd_, &vec, 1, 0),
-              SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(
+      (n = RetryEINTR(pwritev)(fd_, &vec, 1, orig_size)),
+      AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+  EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
 }
 
 // getdents returns EFAULT when the you claim the buffer is large enough, but
@@ -283,29 +296,6 @@ TEST_F(PartialBadBufferTest, GetdentsOneEntry) {
       SyscallSucceedsWithValue(Gt(0)));
 }
 
-// Verify that when write returns EFAULT the kernel hasn't silently written
-// the initial valid bytes.
-TEST_F(PartialBadBufferTest, WriteEfaultIsntPartial) {
-  // FIXME(b/24788078): The sentry write syscalls will return immediately
-  // if Access returns an error, but Access may not return an error
-  // and the sentry will instead perform a partial write.
-  SKIP_IF(IsRunningOnGvisor());
-
-  bad_buffer_[0] = 'A';
-  EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, 10),
-              SyscallFailsWithErrno(EFAULT));
-
-  size_t size = 255;
-  char buf[255];
-  memset(buf, 0, size);
-
-  EXPECT_THAT(RetryEINTR(pread)(fd_, buf, size, 0),
-              SyscallSucceedsWithValue(sizeof(kMessage) - 1));
-
-  // 'A' has not been written.
-  EXPECT_STREQ(buf, kMessage);
-}
-
 PosixErrorOr<sockaddr_storage> InetLoopbackAddr(int family) {
   struct sockaddr_storage addr;
   memset(&addr, 0, sizeof(addr));
-- 
cgit v1.2.3


From a53ac7307abfeb7172e67f48d0a7aaa4b5c3f31e Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 7 Jan 2020 23:52:59 -0800
Subject: fs/splice: don't report a partialResult error if there is no data
 loss

PiperOrigin-RevId: 288642552
---
 pkg/sentry/fs/file.go          |  7 +++++++
 pkg/sentry/fs/splice.go        |  5 +++++
 test/syscalls/linux/inotify.cc | 28 ++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index c0a6e884b..a2f966cb6 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -555,6 +555,10 @@ type lockedWriter struct {
 	//
 	// This applies only to Write, not WriteAt.
 	Offset int64
+
+	// Err contains the first error encountered while copying. This is
+	// useful to determine whether Writer or Reader failed during io.Copy.
+	Err error
 }
 
 // Write implements io.Writer.Write.
@@ -590,5 +594,8 @@ func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) {
 			break
 		}
 	}
+	if w.Err == nil {
+		w.Err = err
+	}
 	return written, err
 }
diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go
index 311798811..389c330a0 100644
--- a/pkg/sentry/fs/splice.go
+++ b/pkg/sentry/fs/splice.go
@@ -167,6 +167,11 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64,
 		if !srcPipe && !opts.SrcOffset {
 			atomic.StoreInt64(&src.offset, src.offset+n)
 		}
+
+		// Don't report any errors if we have some progress without data loss.
+		if w.Err == nil {
+			err = nil
+		}
 	}
 
 	// Drop locks.
diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index 7384c27dc..59ec9940a 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -1591,6 +1591,34 @@ TEST(Inotify, EpollNoDeadlock) {
   }
 }
 
+TEST(Inotify, SpliceEvent) {
+  int pipes[2];
+  ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds());
+
+  const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      root.path(), "some content", TempPath::kDefaultFileMode));
+
+  const FileDescriptor file1_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+  const int watcher = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+  char buf;
+  EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+
+  EXPECT_THAT(splice(fd.get(), nullptr, pipes[1], nullptr,
+                     sizeof(struct inotify_event) + 1, SPLICE_F_NONBLOCK),
+              SyscallSucceedsWithValue(sizeof(struct inotify_event)));
+
+  const FileDescriptor read_fd(pipes[0]);
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(read_fd.get()));
+  ASSERT_THAT(events, Are({Event(IN_ACCESS, watcher)}));
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From d01240d871c8737989b1af27c137f6ae40bc6d37 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Wed, 8 Jan 2020 13:52:56 -0800
Subject: Take addresses as const

PiperOrigin-RevId: 288767927
---
 test/syscalls/linux/ip_socket_test_util.cc | 10 +++++-----
 test/syscalls/linux/ip_socket_test_util.h  |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index 8398fc95f..6b472eb2f 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -187,24 +187,24 @@ PosixErrorOr<int> IfAddrHelper::GetIndex(std::string name) {
   return InterfaceIndex(name);
 }
 
-std::string GetAddr4Str(in_addr* a) {
+std::string GetAddr4Str(const in_addr* a) {
   char str[INET_ADDRSTRLEN];
   inet_ntop(AF_INET, a, str, sizeof(str));
   return std::string(str);
 }
 
-std::string GetAddr6Str(in6_addr* a) {
+std::string GetAddr6Str(const in6_addr* a) {
   char str[INET6_ADDRSTRLEN];
   inet_ntop(AF_INET6, a, str, sizeof(str));
   return std::string(str);
 }
 
-std::string GetAddrStr(sockaddr* a) {
+std::string GetAddrStr(const sockaddr* a) {
   if (a->sa_family == AF_INET) {
-    auto src = &(reinterpret_cast<sockaddr_in*>(a)->sin_addr);
+    auto src = &(reinterpret_cast<const sockaddr_in*>(a)->sin_addr);
     return GetAddr4Str(src);
   } else if (a->sa_family == AF_INET6) {
-    auto src = &(reinterpret_cast<sockaddr_in6*>(a)->sin6_addr);
+    auto src = &(reinterpret_cast<const sockaddr_in6*>(a)->sin6_addr);
     return GetAddr6Str(src);
   }
   return std::string("<invalid>");
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 9cb4566db..0f58e0f77 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -105,14 +105,14 @@ class IfAddrHelper {
 };
 
 // GetAddr4Str returns the given IPv4 network address structure as a string.
-std::string GetAddr4Str(in_addr* a);
+std::string GetAddr4Str(const in_addr* a);
 
 // GetAddr6Str returns the given IPv6 network address structure as a string.
-std::string GetAddr6Str(in6_addr* a);
+std::string GetAddr6Str(const in6_addr* a);
 
 // GetAddrStr returns the given IPv4 or IPv6 network address structure as a
 // string.
-std::string GetAddrStr(sockaddr* a);
+std::string GetAddrStr(const sockaddr* a);
 
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From b3ae8a62cfdf13821d35467d4150ed983ac556f1 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Wed, 8 Jan 2020 16:29:12 -0800
Subject: Fix slice bounds out of range panic in parsing socket control
 message.

Panic found by syzkaller.

PiperOrigin-RevId: 288799046
---
 pkg/sentry/socket/control/control.go     |  6 ++++++
 test/syscalls/linux/socket_ip_unbound.cc | 33 ++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index af1a4e95f..4301b697c 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -471,6 +471,9 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 		case linux.SOL_IP:
 			switch h.Type {
 			case linux.IP_TOS:
+				if length < linux.SizeOfControlMessageTOS {
+					return socket.ControlMessages{}, syserror.EINVAL
+				}
 				cmsgs.IP.HasTOS = true
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS)
 				i += AlignUp(length, width)
@@ -481,6 +484,9 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 		case linux.SOL_IPV6:
 			switch h.Type {
 			case linux.IPV6_TCLASS:
+				if length < linux.SizeOfControlMessageTClass {
+					return socket.ControlMessages{}, syserror.EINVAL
+				}
 				cmsgs.IP.HasTClass = true
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], usermem.ByteOrder, &cmsgs.IP.TClass)
 				i += AlignUp(length, width)
diff --git a/test/syscalls/linux/socket_ip_unbound.cc b/test/syscalls/linux/socket_ip_unbound.cc
index b6754111f..ca597e267 100644
--- a/test/syscalls/linux/socket_ip_unbound.cc
+++ b/test/syscalls/linux/socket_ip_unbound.cc
@@ -129,6 +129,7 @@ TEST_P(IPUnboundSocketTest, InvalidNegativeTtl) {
 struct TOSOption {
   int level;
   int option;
+  int cmsg_level;
 };
 
 constexpr int INET_ECN_MASK = 3;
@@ -139,10 +140,12 @@ static TOSOption GetTOSOption(int domain) {
     case AF_INET:
       opt.level = IPPROTO_IP;
       opt.option = IP_TOS;
+      opt.cmsg_level = SOL_IP;
       break;
     case AF_INET6:
       opt.level = IPPROTO_IPV6;
       opt.option = IPV6_TCLASS;
+      opt.cmsg_level = SOL_IPV6;
       break;
   }
   return opt;
@@ -386,6 +389,36 @@ TEST_P(IPUnboundSocketTest, NullTOS) {
               SyscallFailsWithErrno(EFAULT));
 }
 
+TEST_P(IPUnboundSocketTest, InsufficientBufferTOS) {
+  SKIP_IF(GetParam().protocol == IPPROTO_TCP);
+
+  auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  TOSOption t = GetTOSOption(GetParam().domain);
+
+  in_addr addr4;
+  in6_addr addr6;
+  ASSERT_THAT(inet_pton(AF_INET, "127.0.0.1", &addr4), ::testing::Eq(1));
+  ASSERT_THAT(inet_pton(AF_INET6, "fe80::", &addr6), ::testing::Eq(1));
+
+  cmsghdr cmsg = {};
+  cmsg.cmsg_len = sizeof(cmsg);
+  cmsg.cmsg_level = t.cmsg_level;
+  cmsg.cmsg_type = t.option;
+
+  msghdr msg = {};
+  msg.msg_control = &cmsg;
+  msg.msg_controllen = sizeof(cmsg);
+  if (GetParam().domain == AF_INET) {
+    msg.msg_name = &addr4;
+    msg.msg_namelen = sizeof(addr4);
+  } else {
+    msg.msg_name = &addr6;
+    msg.msg_namelen = sizeof(addr6);
+  }
+
+  EXPECT_THAT(sendmsg(socket->get(), &msg, 0), SyscallFailsWithErrno(EINVAL));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     IPUnboundSockets, IPUnboundSocketTest,
     ::testing::ValuesIn(VecCat<SocketKind>(VecCat<SocketKind>(
-- 
cgit v1.2.3


From fbb2c008e26a7e9d860f6cbf796ea7c375858502 Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Wed, 8 Jan 2020 16:35:43 -0800
Subject: Return correct length with MSG_TRUNC for unix sockets.

This change calls a new Truncate method on the EndpointReader in RecvMsg for
both netlink and unix sockets.  This allows readers such as sockets to peek at
the length of data without actually reading it to a buffer.

Fixes #993 #1240

PiperOrigin-RevId: 288800167
---
 pkg/sentry/socket/netlink/BUILD                   |   1 -
 pkg/sentry/socket/netlink/socket.go               |  29 +++---
 pkg/sentry/socket/unix/io.go                      |  13 +++
 pkg/sentry/socket/unix/unix.go                    |  23 ++++-
 test/syscalls/linux/BUILD                         |   1 -
 test/syscalls/linux/socket_non_stream.cc          | 113 +++++++++++++++++++++-
 test/syscalls/linux/socket_non_stream_blocking.cc |  37 +++++++
 test/syscalls/linux/socket_stream.cc              |  55 ++++++++++-
 8 files changed, 250 insertions(+), 22 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 79589e3c8..136821963 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -22,7 +22,6 @@ go_library(
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
-        "//pkg/sentry/safemem",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/netlink/port",
         "//pkg/sentry/socket/unix",
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 4a1b87a9a..d2e3644a6 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -29,7 +29,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
@@ -500,29 +499,29 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have
 	trunc := flags&linux.MSG_TRUNC != 0
 
 	r := unix.EndpointReader{
+		Ctx:      t,
 		Endpoint: s.ep,
 		Peek:     flags&linux.MSG_PEEK != 0,
 	}
 
+	doRead := func() (int64, error) {
+		return dst.CopyOutFrom(t, &r)
+	}
+
 	// If MSG_TRUNC is set with a zero byte destination then we still need
 	// to read the message and discard it, or in the case where MSG_PEEK is
 	// set, leave it be. In both cases the full message length must be
-	// returned. However, the memory manager for the destination will not read
-	// the endpoint if the destination is zero length.
-	//
-	// In order for the endpoint to be read when the destination size is zero,
-	// we must cause a read of the endpoint by using a separate fake zero
-	// length block sequence and calling the EndpointReader directly.
+	// returned.
 	if trunc && dst.Addrs.NumBytes() == 0 {
-		// Perform a read to a zero byte block sequence. We can ignore the
-		// original destination since it was zero bytes. The length returned by
-		// ReadToBlocks is ignored and we return the full message length to comply
-		// with MSG_TRUNC.
-		_, err := r.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(make([]byte, 0))))
-		return int(r.MsgSize), linux.MSG_TRUNC, from, fromLen, socket.ControlMessages{}, syserr.FromError(err)
+		doRead = func() (int64, error) {
+			err := r.Truncate()
+			// Always return zero for bytes read since the destination size is
+			// zero.
+			return 0, err
+		}
 	}
 
-	if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
+	if n, err := doRead(); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
 		var mflags int
 		if n < int64(r.MsgSize) {
 			mflags |= linux.MSG_TRUNC
@@ -540,7 +539,7 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have
 	defer s.EventUnregister(&e)
 
 	for {
-		if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock {
+		if n, err := doRead(); err != syserror.ErrWouldBlock {
 			var mflags int
 			if n < int64(r.MsgSize) {
 				mflags |= linux.MSG_TRUNC
diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go
index 2ec1a662d..2447f24ef 100644
--- a/pkg/sentry/socket/unix/io.go
+++ b/pkg/sentry/socket/unix/io.go
@@ -83,6 +83,19 @@ type EndpointReader struct {
 	ControlTrunc bool
 }
 
+// Truncate calls RecvMsg on the endpoint without writing to a destination.
+func (r *EndpointReader) Truncate() error {
+	// Ignore bytes read since it will always be zero.
+	_, ms, c, ct, err := r.Endpoint.RecvMsg(r.Ctx, [][]byte{}, r.Creds, r.NumRights, r.Peek, r.From)
+	r.Control = c
+	r.ControlTrunc = ct
+	r.MsgSize = ms
+	if err != nil {
+		return err.ToError()
+	}
+	return nil
+}
+
 // ReadToBlocks implements safemem.Reader.ReadToBlocks.
 func (r *EndpointReader) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 	return safemem.FromVecReaderFunc{func(bufs [][]byte) (int64, error) {
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 885758054..91effe89a 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -544,8 +544,27 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 	if senderRequested {
 		r.From = &tcpip.FullAddress{}
 	}
+
+	doRead := func() (int64, error) {
+		return dst.CopyOutFrom(t, &r)
+	}
+
+	// If MSG_TRUNC is set with a zero byte destination then we still need
+	// to read the message and discard it, or in the case where MSG_PEEK is
+	// set, leave it be. In both cases the full message length must be
+	// returned.
+	if trunc && dst.Addrs.NumBytes() == 0 {
+		doRead = func() (int64, error) {
+			err := r.Truncate()
+			// Always return zero for bytes read since the destination size is
+			// zero.
+			return 0, err
+		}
+
+	}
+
 	var total int64
-	if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || dontWait {
+	if n, err := doRead(); err != syserror.ErrWouldBlock || dontWait {
 		var from linux.SockAddr
 		var fromLen uint32
 		if r.From != nil && len([]byte(r.From.Addr)) != 0 {
@@ -580,7 +599,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 	defer s.EventUnregister(&e)
 
 	for {
-		if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock {
+		if n, err := doRead(); err != syserror.ErrWouldBlock {
 			var from linux.SockAddr
 			var fromLen uint32
 			if r.From != nil {
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 064ce8429..ce8abe217 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2888,7 +2888,6 @@ cc_library(
         ":unix_domain_socket_test_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "//test/util:timer_util",
         "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
     ],
diff --git a/test/syscalls/linux/socket_non_stream.cc b/test/syscalls/linux/socket_non_stream.cc
index d91c5ed39..c61817f14 100644
--- a/test/syscalls/linux/socket_non_stream.cc
+++ b/test/syscalls/linux/socket_non_stream.cc
@@ -113,7 +113,7 @@ TEST_P(NonStreamSocketPairTest, RecvmsgMsghdrFlagMsgTrunc) {
   EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data)));
 
   // Check that msghdr flags were updated.
-  EXPECT_EQ(msg.msg_flags, MSG_TRUNC);
+  EXPECT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
 }
 
 // Stream sockets allow data sent with multiple sends to be peeked at in a
@@ -193,7 +193,7 @@ TEST_P(NonStreamSocketPairTest, MsgTruncTruncationRecvmsgMsghdrFlagMsgTrunc) {
   EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data)));
 
   // Check that msghdr flags were updated.
-  EXPECT_EQ(msg.msg_flags, MSG_TRUNC);
+  EXPECT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
 }
 
 TEST_P(NonStreamSocketPairTest, MsgTruncSameSize) {
@@ -224,5 +224,114 @@ TEST_P(NonStreamSocketPairTest, MsgTruncNotFull) {
   EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
 }
 
+// This test tests reading from a socket with MSG_TRUNC and a zero length
+// receive buffer. The user should be able to get the message length.
+TEST_P(NonStreamSocketPairTest, RecvmsgMsgTruncZeroLen) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  char sent_data[10];
+  RandomizeBuffer(sent_data, sizeof(sent_data));
+  ASSERT_THAT(
+      RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+      SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  // The receive buffer is of zero length.
+  char received_data[0] = {};
+
+  struct iovec iov;
+  iov.iov_base = received_data;
+  iov.iov_len = sizeof(received_data);
+  struct msghdr msg = {};
+  msg.msg_flags = -1;
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+
+  // The syscall succeeds returning the full size of the message on the socket.
+  ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC),
+              SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  // Check that MSG_TRUNC is set on msghdr flags.
+  EXPECT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+}
+
+// This test tests reading from a socket with MSG_TRUNC | MSG_PEEK and a zero
+// length receive buffer. The user should be able to get the message length
+// without reading data off the socket.
+TEST_P(NonStreamSocketPairTest, RecvmsgMsgTruncMsgPeekZeroLen) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  char sent_data[10];
+  RandomizeBuffer(sent_data, sizeof(sent_data));
+  ASSERT_THAT(
+      RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+      SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  // The receive buffer is of zero length.
+  char peek_data[0] = {};
+
+  struct iovec peek_iov;
+  peek_iov.iov_base = peek_data;
+  peek_iov.iov_len = sizeof(peek_data);
+  struct msghdr peek_msg = {};
+  peek_msg.msg_flags = -1;
+  peek_msg.msg_iov = &peek_iov;
+  peek_msg.msg_iovlen = 1;
+
+  // The syscall succeeds returning the full size of the message on the socket.
+  ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &peek_msg,
+                                  MSG_TRUNC | MSG_PEEK),
+              SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  // Check that MSG_TRUNC is set on msghdr flags because the receive buffer is
+  // smaller than the message size.
+  EXPECT_EQ(peek_msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+
+  char received_data[sizeof(sent_data)] = {};
+
+  struct iovec received_iov;
+  received_iov.iov_base = received_data;
+  received_iov.iov_len = sizeof(received_data);
+  struct msghdr received_msg = {};
+  received_msg.msg_flags = -1;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+
+  // Next we can read the actual data.
+  ASSERT_THAT(
+      RetryEINTR(recvmsg)(sockets->second_fd(), &received_msg, MSG_TRUNC),
+      SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+  // Check that MSG_TRUNC is not set on msghdr flags because we read the whole
+  // message.
+  EXPECT_EQ(received_msg.msg_flags & MSG_TRUNC, 0);
+}
+
+// This test tests reading from a socket with MSG_TRUNC | MSG_PEEK and a zero
+// length receive buffer and MSG_DONTWAIT. The user should be able to get an
+// EAGAIN or EWOULDBLOCK error response.
+TEST_P(NonStreamSocketPairTest, RecvmsgTruncPeekDontwaitZeroLen) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // NOTE: We don't send any data on the socket.
+
+  // The receive buffer is of zero length.
+  char peek_data[0] = {};
+
+  struct iovec peek_iov;
+  peek_iov.iov_base = peek_data;
+  peek_iov.iov_len = sizeof(peek_data);
+  struct msghdr peek_msg = {};
+  peek_msg.msg_flags = -1;
+  peek_msg.msg_iov = &peek_iov;
+  peek_msg.msg_iovlen = 1;
+
+  // recvmsg fails with EAGAIN because no data is available on the socket.
+  ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &peek_msg,
+                                  MSG_TRUNC | MSG_PEEK | MSG_DONTWAIT),
+              SyscallFailsWithErrno(EAGAIN));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_non_stream_blocking.cc b/test/syscalls/linux/socket_non_stream_blocking.cc
index 62d87c1af..b052f6e61 100644
--- a/test/syscalls/linux/socket_non_stream_blocking.cc
+++ b/test/syscalls/linux/socket_non_stream_blocking.cc
@@ -25,6 +25,7 @@
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
+#include "test/util/thread_util.h"
 
 namespace gvisor {
 namespace testing {
@@ -44,5 +45,41 @@ TEST_P(BlockingNonStreamSocketPairTest, RecvLessThanBufferWaitAll) {
               SyscallSucceedsWithValue(sizeof(sent_data)));
 }
 
+// This test tests reading from a socket with MSG_TRUNC | MSG_PEEK and a zero
+// length receive buffer, without MSG_DONTWAIT. The recvmsg call should block
+// until data is sent on the socket.
+TEST_P(BlockingNonStreamSocketPairTest,
+       RecvmsgTruncPeekDontwaitZeroLenBlocking) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // NOTE: We don't initially send any data on the socket.
+  const int data_size = 10;
+  char sent_data[data_size];
+  RandomizeBuffer(sent_data, data_size);
+
+  // The receive buffer is of zero length.
+  char peek_data[0] = {};
+
+  struct iovec peek_iov;
+  peek_iov.iov_base = peek_data;
+  peek_iov.iov_len = sizeof(peek_data);
+  struct msghdr peek_msg = {};
+  peek_msg.msg_flags = -1;
+  peek_msg.msg_iov = &peek_iov;
+  peek_msg.msg_iovlen = 1;
+
+  ScopedThread t([&]() {
+    // The syscall succeeds returning the full size of the message on the
+    // socket. This should block until there is data on the socket.
+    ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &peek_msg,
+                                    MSG_TRUNC | MSG_PEEK),
+                SyscallSucceedsWithValue(data_size));
+  });
+
+  absl::SleepFor(absl::Seconds(1));
+  ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), sent_data, data_size, 0),
+              SyscallSucceedsWithValue(data_size));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_stream.cc b/test/syscalls/linux/socket_stream.cc
index 346443f96..6522b2e01 100644
--- a/test/syscalls/linux/socket_stream.cc
+++ b/test/syscalls/linux/socket_stream.cc
@@ -104,7 +104,60 @@ TEST_P(StreamSocketPairTest, RecvmsgMsghdrFlagsNoMsgTrunc) {
   EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data)));
 
   // Check that msghdr flags were cleared (MSG_TRUNC was not set).
-  EXPECT_EQ(msg.msg_flags, 0);
+  ASSERT_EQ(msg.msg_flags & MSG_TRUNC, 0);
+}
+
+TEST_P(StreamSocketPairTest, RecvmsgTruncZeroLen) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  char sent_data[10];
+  RandomizeBuffer(sent_data, sizeof(sent_data));
+  ASSERT_THAT(
+      RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+      SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  char received_data[0] = {};
+
+  struct iovec iov;
+  iov.iov_base = received_data;
+  iov.iov_len = sizeof(received_data);
+  struct msghdr msg = {};
+  msg.msg_flags = -1;
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+
+  ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC),
+              SyscallSucceedsWithValue(0));
+
+  // Check that msghdr flags were cleared (MSG_TRUNC was not set).
+  ASSERT_EQ(msg.msg_flags & MSG_TRUNC, 0);
+}
+
+TEST_P(StreamSocketPairTest, RecvmsgTruncPeekZeroLen) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  char sent_data[10];
+  RandomizeBuffer(sent_data, sizeof(sent_data));
+  ASSERT_THAT(
+      RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+      SyscallSucceedsWithValue(sizeof(sent_data)));
+
+  char received_data[0] = {};
+
+  struct iovec iov;
+  iov.iov_base = received_data;
+  iov.iov_len = sizeof(received_data);
+  struct msghdr msg = {};
+  msg.msg_flags = -1;
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+
+  ASSERT_THAT(
+      RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC | MSG_PEEK),
+      SyscallSucceedsWithValue(0));
+
+  // Check that msghdr flags were cleared (MSG_TRUNC was not set).
+  ASSERT_EQ(msg.msg_flags & MSG_TRUNC, 0);
 }
 
 TEST_P(StreamSocketPairTest, MsgTrunc) {
-- 
cgit v1.2.3


From 356d81146bafc4b4548163eb87e886c851b49e12 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 9 Jan 2020 17:56:58 -0800
Subject: Deflake a couple of TCP syscall tests when run under gotsan.

PiperOrigin-RevId: 289010316
---
 .../linux/socket_bind_to_device_distribution.cc    | 25 ++++++++++++++++++--
 test/syscalls/linux/socket_inet_loopback.cc        | 27 +++++++++++++++++++---
 2 files changed, 47 insertions(+), 5 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_bind_to_device_distribution.cc b/test/syscalls/linux/socket_bind_to_device_distribution.cc
index 5767181a1..5ed57625c 100644
--- a/test/syscalls/linux/socket_bind_to_device_distribution.cc
+++ b/test/syscalls/linux/socket_bind_to_device_distribution.cc
@@ -183,7 +183,14 @@ TEST_P(BindToDeviceDistributionTest, Tcp) {
             }
             // Receive some data from a socket to be sure that the connect()
             // system call has been completed on another side.
-            int data;
+            // Do a short read and then close the socket to trigger a RST. This
+            // ensures that both ends of the connection are cleaned up and no
+            // goroutines hang around in TIME-WAIT. We do this so that this test
+            // does not time out under gotsan runs where lots of goroutines can
+            // cause the test to use absurd amounts of memory.
+            //
+            // See: https://tools.ietf.org/html/rfc2525#page-50 section 2.17
+            uint16_t data;
             EXPECT_THAT(
                 RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0),
                 SyscallSucceedsWithValue(sizeof(data)));
@@ -198,15 +205,29 @@ TEST_P(BindToDeviceDistributionTest, Tcp) {
   }
 
   for (int i = 0; i < kConnectAttempts; i++) {
-    FileDescriptor const fd = ASSERT_NO_ERRNO_AND_VALUE(
+    const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
         Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
     ASSERT_THAT(
         RetryEINTR(connect)(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
                             connector.addr_len),
         SyscallSucceeds());
 
+    // Do two separate sends to ensure two segments are received. This is
+    // required for netstack, which assumes that a whole segment has been
+    // read whenever endpoint.Read() is called. That assumption is wrong,
+    // because the syscall that invoked endpoint.Read() may consume the
+    // segment only partially. As a result, a close() of such a socket
+    // does not trigger a RST in netstack: the endpoint believes that it
+    // has no unread data.
     EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
                 SyscallSucceedsWithValue(sizeof(i)));
+
+    // TODO(gvisor.dev/issue/1449): Remove this block once netstack correctly
+    //   generates a RST.
+    if (IsRunningOnGvisor()) {
+      EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
+                  SyscallSucceedsWithValue(sizeof(i)));
+    }
   }
 
   // Join threads to be sure that all connections have been counted.
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 619d41901..138024d9e 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -714,7 +714,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
   sockaddr_storage listen_addr = listener.addr;
   sockaddr_storage conn_addr = connector.addr;
   constexpr int kThreadCount = 3;
-  constexpr int kConnectAttempts = 4096;
+  constexpr int kConnectAttempts = 10000;
 
   // Create the listening socket.
   FileDescriptor listener_fds[kThreadCount];
@@ -729,7 +729,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
     ASSERT_THAT(
         bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
         SyscallSucceeds());
-    ASSERT_THAT(listen(fd, kConnectAttempts / 3), SyscallSucceeds());
+    ASSERT_THAT(listen(fd, 40), SyscallSucceeds());
 
     // On the first bind we need to determine which port was bound.
     if (i != 0) {
@@ -772,7 +772,14 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
             }
             // Receive some data from a socket to be sure that the connect()
             // system call has been completed on another side.
-            int data;
+            // Do a short read and then close the socket to trigger a RST. This
+            // ensures that both ends of the connection are cleaned up and no
+            // goroutines hang around in TIME-WAIT. We do this so that this test
+            // does not time out under gotsan runs where lots of goroutines can
+            // cause the test to use absurd amounts of memory.
+            //
+            // See: https://tools.ietf.org/html/rfc2525#page-50 section 2.17
+            uint16_t data;
             EXPECT_THAT(
                 RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0),
                 SyscallSucceedsWithValue(sizeof(data)));
@@ -795,8 +802,22 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
                               connector.addr_len),
           SyscallSucceeds());
 
+      // Do two separate sends to ensure two segments are received. This is
+      // required for netstack, which assumes that a whole segment has been
+      // read whenever endpoint.Read() is called. That assumption is wrong,
+      // because the syscall that invoked endpoint.Read() may consume the
+      // segment only partially. As a result, a close() of such a socket
+      // does not trigger a RST in netstack: the endpoint believes that it
+      // has no unread data.
       EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
                   SyscallSucceedsWithValue(sizeof(i)));
+
+      // TODO(gvisor.dev/issue/1449): Remove this block once netstack correctly
+      //   generates a RST.
+      if (IsRunningOnGvisor()) {
+        EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
+                    SyscallSucceedsWithValue(sizeof(i)));
+      }
     }
   });
 
-- 
cgit v1.2.3


From ebd25099bfb9ac6af9739dd9a7795aff13f8e34a Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Fri, 10 Jan 2020 16:45:45 +0800
Subject: enable //test/syscalls:proc_test support on Arm64

Problems with different platform architectures have been solved.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 test/syscalls/linux/proc.cc | 70 +++++++++++++++++++++++++++++++--------------
 1 file changed, 48 insertions(+), 22 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 8cf08991b..66f89ef64 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -102,7 +102,55 @@ namespace {
 
 // O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
 // because "it isn't needed", even though Linux can return it via F_GETFL.
+#if defined(__x86_64__) || defined(__i386__)
 constexpr int kOLargeFile = 00100000;
+#elif __aarch64__
+// The value originates from the Linux
+// kernel's arch/arm64/include/uapi/asm/fcntl.h.
+constexpr int kOLargeFile = 00400000;
+#else
+#error "Unknown architecture"
+#endif
+
+#if defined(__x86_64__) || defined(__i386__)
+  // This list of "required" fields is taken from reading the file
+  // arch/x86/kernel/cpu/proc.c and seeing which fields will be unconditionally
+  // printed by the kernel.
+  static const char* required_fields[] = {
+      "processor",
+      "vendor_id",
+      "cpu family",
+      "model\t\t:",
+      "model name",
+      "stepping",
+      "cpu MHz",
+      "fpu\t\t:",
+      "fpu_exception",
+      "cpuid level",
+      "wp",
+      "bogomips",
+      "clflush size",
+      "cache_alignment",
+      "address sizes",
+      "power management",
+  };
+#elif __aarch64__
+  // This list of "required" fields is taken from reading the file
+  // arch/arm64/kernel/cpuinfo.c and seeing which fields will be unconditionally
+  // printed by the kernel.
+  static const char* required_fields[] = {
+      "processor",
+      "BogoMIPS",
+      "Features",
+      "CPU implementer",
+      "CPU architecture",
+      "CPU variant",
+      "CPU part",
+      "CPU revision",
+  };
+#else
+#error "Unknown architecture"
+#endif
 
 // Takes the subprocess command line and pid.
 // If it returns !OK, WithSubprocess returns immediately.
@@ -717,28 +765,6 @@ TEST(ProcCpuinfo, RequiredFieldsArePresent) {
   ASSERT_FALSE(proc_cpuinfo.empty());
   std::vector<std::string> cpuinfo_fields = absl::StrSplit(proc_cpuinfo, '\n');
 
-  // This list of "required" fields is taken from reading the file
-  // arch/x86/kernel/cpu/proc.c and seeing which fields will be unconditionally
-  // printed by the kernel.
-  static const char* required_fields[] = {
-      "processor",
-      "vendor_id",
-      "cpu family",
-      "model\t\t:",
-      "model name",
-      "stepping",
-      "cpu MHz",
-      "fpu\t\t:",
-      "fpu_exception",
-      "cpuid level",
-      "wp",
-      "bogomips",
-      "clflush size",
-      "cache_alignment",
-      "address sizes",
-      "power management",
-  };
-
   // Check that the usual fields are there. We don't really care about the
   // contents.
   for (const std::string& field : required_fields) {
-- 
cgit v1.2.3


From bf6429b944aed6de073c62ceb446cfaed5042dbc Mon Sep 17 00:00:00 2001
From: Brad Burlage <brb@google.com>
Date: Fri, 10 Jan 2020 16:34:59 -0800
Subject: Don't set RWF_HIPRI on InvalidOffset test.

This test fails on ubuntu 18.04 because preadv2 for some reason returns
EOPNOTSUPP instead of EINVAL. Instead of root-causing the failure, I'm dropping
the flag in the preadv2 call since it isn't under test in this scenario.

PiperOrigin-RevId: 289188358
---
 test/syscalls/linux/preadv2.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/preadv2.cc b/test/syscalls/linux/preadv2.cc
index c9246367d..cd936ea90 100644
--- a/test/syscalls/linux/preadv2.cc
+++ b/test/syscalls/linux/preadv2.cc
@@ -202,7 +202,7 @@ TEST(Preadv2Test, TestInvalidOffset) {
   iov[0].iov_len = 0;
 
   EXPECT_THAT(preadv2(fd.get(), iov.get(), /*iovcnt=*/1, /*offset=*/-8,
-                      /*flags=*/RWF_HIPRI),
+                      /*flags=*/0),
               SyscallFailsWithErrno(EINVAL));
 }
 
-- 
cgit v1.2.3


From f54b9c0ee6e02f9c8bf32aa268c9028ff741bf7c Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Mon, 13 Jan 2020 10:14:30 -0800
Subject: tests: fix errors detected by asan.

PiperOrigin-RevId: 289467083
---
 test/syscalls/linux/inotify.cc      | 4 ++--
 test/syscalls/linux/poll.cc         | 3 ++-
 test/syscalls/linux/readv_common.cc | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index 59ec9940a..fdef646eb 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -977,7 +977,7 @@ TEST(Inotify, WatchOnRelativePath) {
       ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
 
   // Change working directory to root.
-  const char* old_working_dir = get_current_dir_name();
+  const FileDescriptor cwd = ASSERT_NO_ERRNO_AND_VALUE(Open(".", O_PATH));
   EXPECT_THAT(chdir(root.path().c_str()), SyscallSucceeds());
 
   // Add a watch on file1 with a relative path.
@@ -997,7 +997,7 @@ TEST(Inotify, WatchOnRelativePath) {
   // continue to hold a reference, random save/restore tests can fail if a save
   // is triggered after "root" is unlinked; we can't save deleted fs objects
   // with active references.
-  EXPECT_THAT(chdir(old_working_dir), SyscallSucceeds());
+  EXPECT_THAT(fchdir(cwd.get()), SyscallSucceeds());
 }
 
 TEST(Inotify, ZeroLengthReadWriteDoesNotGenerateEvent) {
diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc
index 9e5aa7fd0..c42472474 100644
--- a/test/syscalls/linux/poll.cc
+++ b/test/syscalls/linux/poll.cc
@@ -275,7 +275,8 @@ TEST_F(PollTest, Nfds) {
   // Each entry in the 'fds' array refers to the eventfd and polls for
   // "writable" events (events=POLLOUT). This essentially guarantees that the
   // poll() is a no-op and allows negative testing of the 'nfds' parameter.
-  std::vector<struct pollfd> fds(max_fds, {.fd = efd.get(), .events = POLLOUT});
+  std::vector<struct pollfd> fds(max_fds + 1,
+                                 {.fd = efd.get(), .events = POLLOUT});
 
   // Verify that 'nfds' up to RLIMIT_NOFILE are allowed.
   EXPECT_THAT(RetryEINTR(poll)(fds.data(), 1, 1), SyscallSucceedsWithValue(1));
diff --git a/test/syscalls/linux/readv_common.cc b/test/syscalls/linux/readv_common.cc
index 491d5f40f..2694dc64f 100644
--- a/test/syscalls/linux/readv_common.cc
+++ b/test/syscalls/linux/readv_common.cc
@@ -154,7 +154,7 @@ void ReadBuffersOverlapping(int fd) {
   char* expected_ptr = expected.data();
   memcpy(expected_ptr, &kReadvTestData[overlap_bytes], overlap_bytes);
   memcpy(&expected_ptr[overlap_bytes], &kReadvTestData[overlap_bytes],
-         kReadvTestDataSize);
+         kReadvTestDataSize - overlap_bytes);
 
   struct iovec iovs[2];
   iovs[0].iov_base = buffer.data();
-- 
cgit v1.2.3


From debd213da61cf35d7c91346820e93fc87bfa5896 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Mon, 13 Jan 2020 14:45:31 -0800
Subject: Allow dual stack sockets to operate on AF_INET

Fixes #1490
Fixes #1495

PiperOrigin-RevId: 289523250
---
 pkg/sentry/socket/netstack/netstack.go      |  65 +++++++++---
 pkg/sentry/socket/unix/unix.go              |   5 +-
 pkg/sentry/strace/socket.go                 |   2 +-
 pkg/tcpip/stack/stack.go                    |  43 ++++++++
 pkg/tcpip/transport/icmp/endpoint.go        |  22 ++--
 pkg/tcpip/transport/tcp/endpoint.go         |  23 +---
 pkg/tcpip/transport/udp/endpoint.go         |  41 ++------
 scripts/common.sh                           |   2 +-
 test/syscalls/linux/BUILD                   |   1 +
 test/syscalls/linux/socket_inet_loopback.cc | 156 ++++++++++++++++++++++++++++
 10 files changed, 278 insertions(+), 82 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 099319327..c020c11cb 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -324,22 +324,15 @@ func bytesToIPAddress(addr []byte) tcpip.Address {
 // converts it to the FullAddress format. It supports AF_UNIX, AF_INET,
 // AF_INET6, and AF_PACKET addresses.
 //
-// strict indicates whether addresses with the AF_UNSPEC family are accepted of not.
-//
 // AddressAndFamily returns an address and its family.
-func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, uint16, *syserr.Error) {
+func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
 	// Make sure we have at least 2 bytes for the address family.
 	if len(addr) < 2 {
 		return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument
 	}
 
-	family := usermem.ByteOrder.Uint16(addr)
-	if family != uint16(sfamily) && (strict || family != linux.AF_UNSPEC) {
-		return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported
-	}
-
 	// Get the rest of the fields based on the address family.
-	switch family {
+	switch family := usermem.ByteOrder.Uint16(addr); family {
 	case linux.AF_UNIX:
 		path := addr[2:]
 		if len(path) > linux.UnixPathMax {
@@ -638,10 +631,40 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return r
 }
 
+func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error {
+	if family == uint16(s.family) {
+		return nil
+	}
+	if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 {
+		v, err := s.Endpoint.GetSockOptBool(tcpip.V6OnlyOption)
+		if err != nil {
+			return syserr.TranslateNetstackError(err)
+		}
+		if !v {
+			return nil
+		}
+	}
+	return syserr.ErrInvalidArgument
+}
+
+// mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the
+// receiver's family is AF_INET6.
+//
+// This is a hack to work around the fact that both IPv4 and IPv6 ANY are
+// represented by the empty string.
+//
+// TODO(gvisor.dev/issues/1556): remove this function.
+func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
+	if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
+		addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00"
+	}
+	return addr
+}
+
 // Connect implements the linux syscall connect(2) for sockets backed by
 // tpcip.Endpoint.
 func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
-	addr, family, err := AddressAndFamily(s.family, sockaddr, false /* strict */)
+	addr, family, err := AddressAndFamily(sockaddr)
 	if err != nil {
 		return err
 	}
@@ -653,6 +676,12 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 		}
 		return syserr.TranslateNetstackError(err)
 	}
+
+	if err := s.checkFamily(family, false /* exact */); err != nil {
+		return err
+	}
+	addr = s.mapFamily(addr, family)
+
 	// Always return right away in the non-blocking case.
 	if !blocking {
 		return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
@@ -681,10 +710,14 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
-	addr, _, err := AddressAndFamily(s.family, sockaddr, true /* strict */)
+	addr, family, err := AddressAndFamily(sockaddr)
 	if err != nil {
 		return err
 	}
+	if err := s.checkFamily(family, true /* exact */); err != nil {
+		return err
+	}
+	addr = s.mapFamily(addr, family)
 
 	// Issue the bind request to the endpoint.
 	return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
@@ -2080,8 +2113,8 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32)
 
 	case linux.AF_INET6:
 		var out linux.SockAddrInet6
-		if len(addr.Addr) == 4 {
-			// Copy address is v4-mapped format.
+		if len(addr.Addr) == header.IPv4AddressSize {
+			// Copy address in v4-mapped format.
 			copy(out.Addr[12:], addr.Addr)
 			out.Addr[10] = 0xff
 			out.Addr[11] = 0xff
@@ -2395,10 +2428,14 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 
 	var addr *tcpip.FullAddress
 	if len(to) > 0 {
-		addrBuf, _, err := AddressAndFamily(s.family, to, true /* strict */)
+		addrBuf, family, err := AddressAndFamily(to)
 		if err != nil {
 			return 0, err
 		}
+		if err := s.checkFamily(family, false /* exact */); err != nil {
+			return 0, err
+		}
+		addrBuf = s.mapFamily(addrBuf, family)
 
 		addr = &addrBuf
 	}
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 91effe89a..7f49ba864 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -116,13 +116,16 @@ func (s *SocketOperations) Endpoint() transport.Endpoint {
 
 // extractPath extracts and validates the address.
 func extractPath(sockaddr []byte) (string, *syserr.Error) {
-	addr, _, err := netstack.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */)
+	addr, family, err := netstack.AddressAndFamily(sockaddr)
 	if err != nil {
 		if err == syserr.ErrAddressFamilyNotSupported {
 			err = syserr.ErrInvalidArgument
 		}
 		return "", err
 	}
+	if family != linux.AF_UNIX {
+		return "", syserr.ErrInvalidArgument
+	}
 
 	// The address is trimmed by GetAddress.
 	p := string(addr.Addr)
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index 51f2efb39..b6d7177f4 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -341,7 +341,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string {
 
 	switch family {
 	case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX:
-		fa, _, err := netstack.AddressAndFamily(int(family), b, true /* strict */)
+		fa, _, err := netstack.AddressAndFamily(b)
 		if err != nil {
 			return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err)
 		}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index a47ceba54..113b457fb 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -547,6 +547,49 @@ type TransportEndpointInfo struct {
 	RegisterNICID tcpip.NICID
 }
 
+// AddrNetProto unwraps the specified address if it is a V4-mapped V6 address
+// and returns the network protocol number to be used to communicate with the
+// specified address. It returns an error if the passed address is incompatible
+// with the receiver.
+func (e *TransportEndpointInfo) AddrNetProto(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	netProto := e.NetProto
+	switch len(addr.Addr) {
+	case header.IPv4AddressSize:
+		netProto = header.IPv4ProtocolNumber
+	case header.IPv6AddressSize:
+		if header.IsV4MappedAddress(addr.Addr) {
+			netProto = header.IPv4ProtocolNumber
+			addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
+			if addr.Addr == header.IPv4Any {
+				addr.Addr = ""
+			}
+		}
+	}
+
+	switch len(e.ID.LocalAddress) {
+	case header.IPv4AddressSize:
+		if len(addr.Addr) == header.IPv6AddressSize {
+			return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState
+		}
+	case header.IPv6AddressSize:
+		if len(addr.Addr) == header.IPv4AddressSize {
+			return tcpip.FullAddress{}, 0, tcpip.ErrNetworkUnreachable
+		}
+	}
+
+	switch {
+	case netProto == e.NetProto:
+	case netProto == header.IPv4ProtocolNumber && e.NetProto == header.IPv6ProtocolNumber:
+		if v6only {
+			return tcpip.FullAddress{}, 0, tcpip.ErrNoRoute
+		}
+	default:
+		return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState
+	}
+
+	return addr, netProto, nil
+}
+
 // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
 // marker interface.
 func (*TransportEndpointInfo) IsEndpointInfo() {}
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 330786f4c..42afb3f5b 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -288,7 +288,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 
 		toCopy := *to
 		to = &toCopy
-		netProto, err := e.checkV4Mapped(to, true)
+		netProto, err := e.checkV4Mapped(to)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -475,18 +475,12 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	})
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	netProto := e.NetProto
-	if header.IsV4MappedAddress(addr.Addr) {
-		return 0, tcpip.ErrNoRoute
-	}
-
-	// Fail if we're bound to an address length different from the one we're
-	// checking.
-	if l := len(e.ID.LocalAddress); !allowMismatch && l != 0 && l != len(addr.Addr) {
-		return 0, tcpip.ErrInvalidEndpointState
+func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, false /* v6only */)
+	if err != nil {
+		return 0, err
 	}
-
+	*addr = unwrapped
 	return netProto, nil
 }
 
@@ -518,7 +512,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr, false)
+	netProto, err := e.checkV4Mapped(&addr)
 	if err != nil {
 		return err
 	}
@@ -631,7 +625,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr, false)
+	netProto, err := e.checkV4Mapped(&addr)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index cca511fb9..cc8b533c8 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1691,26 +1691,11 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	netProto := e.NetProto
-	if header.IsV4MappedAddress(addr.Addr) {
-		// Fail if using a v4 mapped address on a v6only endpoint.
-		if e.v6only {
-			return 0, tcpip.ErrNoRoute
-		}
-
-		netProto = header.IPv4ProtocolNumber
-		addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
-		if addr.Addr == header.IPv4Any {
-			addr.Addr = ""
-		}
-	}
-
-	// Fail if we're bound to an address length different from the one we're
-	// checking.
-	if l := len(e.ID.LocalAddress); l != 0 && len(addr.Addr) != 0 && l != len(addr.Addr) {
-		return 0, tcpip.ErrInvalidEndpointState
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+	if err != nil {
+		return 0, err
 	}
-
+	*addr = unwrapped
 	return netProto, nil
 }
 
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index a4ff29a7d..13446f5d9 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -402,7 +402,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 			return 0, nil, tcpip.ErrBroadcastDisabled
 		}
 
-		netProto, err := e.checkV4Mapped(to, false)
+		netProto, err := e.checkV4Mapped(to)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -501,7 +501,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		defer e.mu.Unlock()
 
 		fa := tcpip.FullAddress{Addr: v.InterfaceAddr}
-		netProto, err := e.checkV4Mapped(&fa, false)
+		netProto, err := e.checkV4Mapped(&fa)
 		if err != nil {
 			return err
 		}
@@ -839,35 +839,12 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 	return nil
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	netProto := e.NetProto
-	if len(addr.Addr) == 0 {
-		return netProto, nil
-	}
-	if header.IsV4MappedAddress(addr.Addr) {
-		// Fail if using a v4 mapped address on a v6only endpoint.
-		if e.v6only {
-			return 0, tcpip.ErrNoRoute
-		}
-
-		netProto = header.IPv4ProtocolNumber
-		addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
-		if addr.Addr == header.IPv4Any {
-			addr.Addr = ""
-		}
-
-		// Fail if we are bound to an IPv6 address.
-		if !allowMismatch && len(e.ID.LocalAddress) == 16 {
-			return 0, tcpip.ErrNetworkUnreachable
-		}
-	}
-
-	// Fail if we're bound to an address length different from the one we're
-	// checking.
-	if l := len(e.ID.LocalAddress); l != 0 && l != len(addr.Addr) {
-		return 0, tcpip.ErrInvalidEndpointState
+func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+	if err != nil {
+		return 0, err
 	}
-
+	*addr = unwrapped
 	return netProto, nil
 }
 
@@ -916,7 +893,7 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 
 // Connect connects the endpoint to its peer. Specifying a NIC is optional.
 func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
-	netProto, err := e.checkV4Mapped(&addr, false)
+	netProto, err := e.checkV4Mapped(&addr)
 	if err != nil {
 		return err
 	}
@@ -1074,7 +1051,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr, true)
+	netProto, err := e.checkV4Mapped(&addr)
 	if err != nil {
 		return err
 	}
diff --git a/scripts/common.sh b/scripts/common.sh
index 6dabad141..fdb1aa142 100755
--- a/scripts/common.sh
+++ b/scripts/common.sh
@@ -73,7 +73,7 @@ function install_runsc() {
   sudo "${RUNSC_BIN}" install --experimental=true --runtime="${runtime}" -- --debug-log "${RUNSC_LOGS}" "$@"
 
   # Clear old logs files that may exist.
-  sudo rm -f "${RUNSC_LOGS_DIR}"/*
+  sudo rm -f "${RUNSC_LOGS_DIR}"/'*'
 
   # Restart docker to pick up the new runtime configuration.
   sudo systemctl restart docker
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index ce8abe217..4c7ec3f06 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2693,6 +2693,7 @@ cc_binary(
     srcs = ["socket_inet_loopback.cc"],
     linkstatic = 1,
     deps = [
+        ":ip_socket_test_util",
         ":socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:posix_error",
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 138024d9e..5d114d460 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -32,6 +32,7 @@
 #include "absl/strings/str_cat.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
@@ -102,6 +103,161 @@ TEST(BadSocketPairArgs, ValidateErrForBadCallsToSocketPair) {
               SyscallFailsWithErrno(EAFNOSUPPORT));
 }
 
+enum class Operation {
+  Bind,
+  Connect,
+  SendTo,
+};
+
+std::string OperationToString(Operation operation) {
+  switch (operation) {
+    case Operation::Bind:
+      return "Bind";
+    case Operation::Connect:
+      return "Connect";
+    case Operation::SendTo:
+      return "SendTo";
+  }
+}
+
+using OperationSequence = std::vector<Operation>;
+
+using DualStackSocketTest =
+    ::testing::TestWithParam<std::tuple<TestAddress, OperationSequence>>;
+
+TEST_P(DualStackSocketTest, AddressOperations) {
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_DGRAM, 0));
+
+  const TestAddress& addr = std::get<0>(GetParam());
+  const OperationSequence& operations = std::get<1>(GetParam());
+
+  auto addr_in = reinterpret_cast<const sockaddr*>(&addr.addr);
+
+  // Sockets may only be bound once. Both `connect` and `sendto` cause a socket
+  // to be bound.
+  bool bound = false;
+  for (const Operation& operation : operations) {
+    bool sockname = false;
+    bool peername = false;
+    switch (operation) {
+      case Operation::Bind: {
+        ASSERT_NO_ERRNO(SetAddrPort(
+            addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 0));
+
+        int bind_ret = bind(fd.get(), addr_in, addr.addr_len);
+
+        // Dual stack sockets may only be bound to AF_INET6.
+        if (!bound && addr.family() == AF_INET6) {
+          EXPECT_THAT(bind_ret, SyscallSucceeds());
+          bound = true;
+
+          sockname = true;
+        } else {
+          EXPECT_THAT(bind_ret, SyscallFailsWithErrno(EINVAL));
+        }
+        break;
+      }
+      case Operation::Connect: {
+        ASSERT_NO_ERRNO(SetAddrPort(
+            addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 1337));
+
+        EXPECT_THAT(connect(fd.get(), addr_in, addr.addr_len),
+                    SyscallSucceeds())
+            << GetAddrStr(addr_in);
+        bound = true;
+
+        sockname = true;
+        peername = true;
+
+        break;
+      }
+      case Operation::SendTo: {
+        const char payload[] = "hello";
+        ASSERT_NO_ERRNO(SetAddrPort(
+            addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 1337));
+
+        ssize_t sendto_ret = sendto(fd.get(), &payload, sizeof(payload), 0,
+                                    addr_in, addr.addr_len);
+
+        EXPECT_THAT(sendto_ret, SyscallSucceedsWithValue(sizeof(payload)));
+        sockname = !bound;
+        bound = true;
+        break;
+      }
+    }
+
+    if (sockname) {
+      sockaddr_storage sock_addr;
+      socklen_t addrlen = sizeof(sock_addr);
+      ASSERT_THAT(getsockname(fd.get(), reinterpret_cast<sockaddr*>(&sock_addr),
+                              &addrlen),
+                  SyscallSucceeds());
+      ASSERT_EQ(addrlen, sizeof(struct sockaddr_in6));
+
+      auto sock_addr_in6 = reinterpret_cast<const sockaddr_in6*>(&sock_addr);
+
+      if (operation == Operation::SendTo) {
+        EXPECT_EQ(sock_addr_in6->sin6_family, AF_INET6);
+        EXPECT_TRUE(IN6_IS_ADDR_UNSPECIFIED(sock_addr_in6->sin6_addr.s6_addr32))
+            << OperationToString(operation) << " getsocknam="
+            << GetAddrStr(reinterpret_cast<sockaddr*>(&sock_addr));
+
+        EXPECT_NE(sock_addr_in6->sin6_port, 0);
+      } else if (IN6_IS_ADDR_V4MAPPED(
+                     reinterpret_cast<const sockaddr_in6*>(addr_in)
+                         ->sin6_addr.s6_addr32)) {
+        EXPECT_TRUE(IN6_IS_ADDR_V4MAPPED(sock_addr_in6->sin6_addr.s6_addr32))
+            << OperationToString(operation) << " getsocknam="
+            << GetAddrStr(reinterpret_cast<sockaddr*>(&sock_addr));
+      }
+    }
+
+    if (peername) {
+      sockaddr_storage peer_addr;
+      socklen_t addrlen = sizeof(peer_addr);
+      ASSERT_THAT(getpeername(fd.get(), reinterpret_cast<sockaddr*>(&peer_addr),
+                              &addrlen),
+                  SyscallSucceeds());
+      ASSERT_EQ(addrlen, sizeof(struct sockaddr_in6));
+
+      if (addr.family() == AF_INET ||
+          IN6_IS_ADDR_V4MAPPED(reinterpret_cast<const sockaddr_in6*>(addr_in)
+                                   ->sin6_addr.s6_addr32)) {
+        EXPECT_TRUE(IN6_IS_ADDR_V4MAPPED(
+            reinterpret_cast<const sockaddr_in6*>(&peer_addr)
+                ->sin6_addr.s6_addr32))
+            << OperationToString(operation) << " getpeername="
+            << GetAddrStr(reinterpret_cast<sockaddr*>(&peer_addr));
+      }
+    }
+  }
+}
+
+// TODO(gvisor.dev/issues/1556): uncomment V4MappedAny.
+INSTANTIATE_TEST_SUITE_P(
+    All, DualStackSocketTest,
+    ::testing::Combine(
+        ::testing::Values(V4Any(), V4Loopback(), /*V4MappedAny(),*/
+                          V4MappedLoopback(), V6Any(), V6Loopback()),
+        ::testing::ValuesIn<OperationSequence>(
+            {{Operation::Bind, Operation::Connect, Operation::SendTo},
+             {Operation::Bind, Operation::SendTo, Operation::Connect},
+             {Operation::Connect, Operation::Bind, Operation::SendTo},
+             {Operation::Connect, Operation::SendTo, Operation::Bind},
+             {Operation::SendTo, Operation::Bind, Operation::Connect},
+             {Operation::SendTo, Operation::Connect, Operation::Bind}})),
+    [](::testing::TestParamInfo<
+        std::tuple<TestAddress, OperationSequence>> const& info) {
+      const TestAddress& addr = std::get<0>(info.param);
+      const OperationSequence& operations = std::get<1>(info.param);
+      std::string s = addr.description;
+      for (const Operation& operation : operations) {
+        absl::StrAppend(&s, OperationToString(operation));
+      }
+      return s;
+    });
+
 void tcpSimpleConnectTest(TestAddress const& listener,
                           TestAddress const& connector, bool unbound) {
   // Create the listening socket.
-- 
cgit v1.2.3


From 50625cee59aaff834c7968771ab385ad0e7b0e1f Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Tue, 14 Jan 2020 13:31:52 -0800
Subject: Implement {g,s}etsockopt(IP_RECVTOS) for UDP sockets

PiperOrigin-RevId: 289718534
---
 pkg/sentry/socket/control/control.go         |  2 +-
 pkg/sentry/socket/netstack/netstack.go       | 36 +++++++++++++--
 pkg/tcpip/checker/checker.go                 | 16 +++++++
 pkg/tcpip/stack/nic.go                       |  2 +-
 pkg/tcpip/stack/stack.go                     |  2 +-
 pkg/tcpip/tcpip.go                           |  8 +++-
 pkg/tcpip/transport/udp/endpoint.go          | 40 +++++++++++++++--
 pkg/tcpip/transport/udp/udp_test.go          | 67 ++++++++++++++++++++++++----
 test/syscalls/linux/socket_ip_udp_generic.cc | 40 +++++++++++++++++
 test/syscalls/linux/udp_socket_test_cases.cc |  8 ++--
 10 files changed, 197 insertions(+), 24 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 4301b697c..1684dfc24 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -327,7 +327,7 @@ func PackInq(t *kernel.Task, inq int32, buf []byte) []byte {
 }
 
 // PackTOS packs an IP_TOS socket control message.
-func PackTOS(t *kernel.Task, tos int8, buf []byte) []byte {
+func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
 	return putCmsgStruct(
 		buf,
 		linux.SOL_IP,
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index c020c11cb..d2f7e987d 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1268,11 +1268,11 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		var o uint32
+		var o int32
 		if v {
 			o = 1
 		}
-		return int32(o), nil
+		return o, nil
 
 	case linux.IPV6_PATHMTU:
 		t.Kernel().EmitUnimplementedEvent(t)
@@ -1377,6 +1377,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		}
 		return int32(v), nil
 
+	case linux.IP_RECVTOS:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveTOSOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+		var o int32
+		if v {
+			o = 1
+		}
+		return o, nil
+
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1895,6 +1910,13 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v)))
 
+	case linux.IP_RECVTOS:
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0))
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
@@ -1915,7 +1937,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		linux.IP_RECVFRAGSIZE,
 		linux.IP_RECVOPTS,
 		linux.IP_RECVORIGDSTADDR,
-		linux.IP_RECVTOS,
 		linux.IP_RECVTTL,
 		linux.IP_RETOPTS,
 		linux.IP_TRANSPARENT,
@@ -2335,7 +2356,14 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 }
 
 func (s *SocketOperations) controlMessages() socket.ControlMessages {
-	return socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, Timestamp: s.readCM.Timestamp}}
+	return socket.ControlMessages{
+		IP: tcpip.ControlMessages{
+			HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp,
+			Timestamp:    s.readCM.Timestamp,
+			HasTOS:       s.readCM.HasTOS,
+			TOS:          s.readCM.TOS,
+		},
+	}
 }
 
 // updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 2f15bf1f1..542abc99d 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -33,6 +33,9 @@ type NetworkChecker func(*testing.T, []header.Network)
 // TransportChecker is a function to check a property of a transport packet.
 type TransportChecker func(*testing.T, header.Transport)
 
+// ControlMessagesChecker is a function to check a property of ancillary data.
+type ControlMessagesChecker func(*testing.T, tcpip.ControlMessages)
+
 // IPv4 checks the validity and properties of the given IPv4 packet. It is
 // expected to be used in conjunction with other network checkers for specific
 // properties. For example, to check the source and destination address, one
@@ -158,6 +161,19 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	}
 }
 
+// ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
+func ReceiveTOS(want uint8) ControlMessagesChecker {
+	return func(t *testing.T, cm tcpip.ControlMessages) {
+		t.Helper()
+		if !cm.HasTOS {
+			t.Fatalf("got cm.HasTOS = %t, want cm.TOS = %d", cm.HasTOS, want)
+		}
+		if got := cm.TOS; got != want {
+			t.Fatalf("got cm.TOS = %d, want %d", got, want)
+		}
+	}
+}
+
 // TOS creates a checker that checks the TOS field.
 func TOS(tos uint8, label uint32) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index abf73fe33..071221d5a 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -763,7 +763,7 @@ func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) {
 	n.mu.Unlock()
 }
 
-// Subnets returns the Subnets associated with this NIC.
+// AddressRanges returns the Subnets associated with this NIC.
 func (n *NIC) AddressRanges() []tcpip.Subnet {
 	n.mu.RLock()
 	defer n.mu.RUnlock()
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index f8d89248e..386eb6eec 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -912,7 +912,7 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	return false
 }
 
-// NICSubnets returns a map of NICIDs to their associated subnets.
+// NICAddressRanges returns a map of NICIDs to their associated subnets.
 func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 4a090ac86..b7813cbc0 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -322,7 +322,7 @@ type ControlMessages struct {
 	HasTOS bool
 
 	// TOS is the IPv4 type of service of the associated packet.
-	TOS int8
+	TOS uint8
 
 	// HasTClass indicates whether Tclass is valid/set.
 	HasTClass bool
@@ -500,9 +500,13 @@ type WriteOptions struct {
 type SockOptBool int
 
 const (
+	// ReceiveTOSOption is used by {G,S}etSockOptBool to specify if the TOS
+	// ancillary message is passed with incoming packets.
+	ReceiveTOSOption SockOptBool = iota
+
 	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
 	// socket is to be restricted to sending and receiving IPv6 packets only.
-	V6OnlyOption SockOptBool = iota
+	V6OnlyOption
 )
 
 // SockOptInt represents socket options which values have the int type.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 13446f5d9..c9cbed8f4 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -31,6 +31,7 @@ type udpPacket struct {
 	senderAddress tcpip.FullAddress
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
+	tos           uint8
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -113,6 +114,10 @@ type endpoint struct {
 	// applied while sending packets. Defaults to 0 as on Linux.
 	sendTOS uint8
 
+	// receiveTOS determines if the incoming IPv4 TOS header field is passed
+	// as ancillary data to ControlMessages on Read.
+	receiveTOS bool
+
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -243,7 +248,18 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 		*addr = p.senderAddress
 	}
 
-	return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil
+	cm := tcpip.ControlMessages{
+		HasTimestamp: true,
+		Timestamp:    p.timestamp,
+	}
+	e.mu.RLock()
+	receiveTOS := e.receiveTOS
+	e.mu.RUnlock()
+	if receiveTOS {
+		cm.HasTOS = true
+		cm.TOS = p.tos
+	}
+	return p.data.ToView(), cm, nil
 }
 
 // prepareForWrite prepares the endpoint for sending data. In particular, it
@@ -458,6 +474,12 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
 func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 	switch opt {
+	case tcpip.ReceiveTOSOption:
+		e.mu.Lock()
+		e.receiveTOS = v
+		e.mu.Unlock()
+		return nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -664,15 +686,21 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
 func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	switch opt {
+	case tcpip.ReceiveTOSOption:
+		e.mu.RLock()
+		v := e.receiveTOS
+		e.mu.RUnlock()
+		return v, nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
 			return false, tcpip.ErrUnknownProtocolOption
 		}
 
-		e.mu.Lock()
+		e.mu.RLock()
 		v := e.v6only
-		e.mu.Unlock()
+		e.mu.RUnlock()
 
 		return v, nil
 	}
@@ -1215,6 +1243,12 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	e.rcvList.PushBack(packet)
 	e.rcvBufSize += pkt.Data.Size()
 
+	// Save any useful information from the network header to the packet.
+	switch r.NetProto {
+	case header.IPv4ProtocolNumber:
+		packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
+	}
+
 	packet.timestamp = e.stack.NowNanoseconds()
 
 	e.rcvMu.Unlock()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 0a82bc4fa..ee9d10555 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -56,6 +56,7 @@ const (
 	multicastAddr   = "\xe8\x2b\xd3\xea"
 	multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
 	broadcastAddr   = header.IPv4Broadcast
+	testTOS         = 0x80
 
 	// defaultMTU is the MTU, in bytes, used throughout the tests, except
 	// where another value is explicitly used. It is chosen to match the MTU
@@ -453,6 +454,7 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
 	ip := header.IPv4(buf)
 	ip.Encode(&header.IPv4Fields{
 		IHL:         header.IPv4MinimumSize,
+		TOS:         testTOS,
 		TotalLength: uint16(len(buf)),
 		TTL:         65,
 		Protocol:    uint8(udp.ProtocolNumber),
@@ -552,8 +554,8 @@ func TestBindToDeviceOption(t *testing.T) {
 // testReadInternal sends a packet of the given test flow into the stack by
 // injecting it into the link endpoint. It then attempts to read it from the
 // UDP endpoint and depending on if this was expected to succeed verifies its
-// correctness.
-func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool) {
+// correctness including any additional checker functions provided.
+func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool, checkers ...checker.ControlMessagesChecker) {
 	c.t.Helper()
 
 	payload := newPayload()
@@ -568,12 +570,12 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
 
 	var addr tcpip.FullAddress
-	v, _, err := c.ep.Read(&addr)
+	v, cm, err := c.ep.Read(&addr)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for data to become available.
 		select {
 		case <-ch:
-			v, _, err = c.ep.Read(&addr)
+			v, cm, err = c.ep.Read(&addr)
 
 		case <-time.After(300 * time.Millisecond):
 			if packetShouldBeDropped {
@@ -606,15 +608,21 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	if !bytes.Equal(payload, v) {
 		c.t.Fatalf("bad payload: got %x, want %x", v, payload)
 	}
+
+	// Run any checkers against the ControlMessages.
+	for _, f := range checkers {
+		f(c.t, cm)
+	}
+
 	c.checkEndpointReadStats(1, epstats, err)
 }
 
 // testRead sends a packet of the given test flow into the stack by injecting it
 // into the link endpoint. It then reads it from the UDP endpoint and verifies
-// its correctness.
-func testRead(c *testContext, flow testFlow) {
+// its correctness including any additional checker functions provided.
+func testRead(c *testContext, flow testFlow, checkers ...checker.ControlMessagesChecker) {
 	c.t.Helper()
-	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */)
+	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */, checkers...)
 }
 
 // testFailingRead sends a packet of the given test flow into the stack by
@@ -1282,7 +1290,7 @@ func TestTOSV4(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = 0xC0
+			const tos = testTOS
 			var v tcpip.IPv4TOSOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1317,7 +1325,7 @@ func TestTOSV6(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = 0xC0
+			const tos = testTOS
 			var v tcpip.IPv6TrafficClassOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1344,6 +1352,47 @@ func TestTOSV6(t *testing.T) {
 	}
 }
 
+func TestReceiveTOSV4(t *testing.T) {
+	for _, flow := range []testFlow{unicastV4, broadcast} {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			c := newDualTestContext(t, defaultMTU)
+			defer c.cleanup()
+
+			c.createEndpointForFlow(flow)
+
+			// Verify that setting and reading the option works.
+			v, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
+			if err != nil {
+				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
+			}
+			// Test for expected default value.
+			if v != false {
+				c.t.Errorf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", v, false)
+			}
+
+			want := true
+			if err := c.ep.SetSockOptBool(tcpip.ReceiveTOSOption, want); err != nil {
+				c.t.Fatalf("SetSockOptBool(tcpip.ReceiveTOSOption, %t) failed: %s", want, err)
+			}
+
+			got, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
+			if err != nil {
+				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
+			}
+			if got != want {
+				c.t.Fatalf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", got, want)
+			}
+
+			// Verify that the correct received TOS is handed through as
+			// ancillary data to the ControlMessages struct.
+			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+				c.t.Fatal("Bind failed:", err)
+			}
+			testRead(c, flow, checker.ReceiveTOS(testTOS))
+		})
+	}
+}
+
 func TestMulticastInterfaceOption(t *testing.T) {
 	for _, flow := range []testFlow{multicastV4, multicastV4in6, multicastV6, multicastV6Only} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 66eb68857..53290bed7 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -209,6 +209,46 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
+// Ensure that Receiving TOS is off by default.
+TEST_P(UDPSocketPairTest, RecvTosDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that setting and getting IP_RECVTOS works as expected.
+TEST_P(UDPSocketPairTest, SetRecvTos) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
+                         &kSockOptOff, sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+}
+
 TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index dc35c2f50..68e0a8109 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,8 +1349,9 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
-  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
+          !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1421,7 +1422,8 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From a611fdaee3c14abe2222140ae0a8a742ebfd31ab Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 14 Jan 2020 14:14:17 -0800
Subject: Changes TCP packet dispatch to use a pool of goroutines.

All inbound segments for connections in ESTABLISHED state are delivered to the
endpoint's queue but for every segment delivered we also queue the endpoint for
processing to a selected processor. This ensures that when there are a large
number of connections in ESTABLISHED state the inbound packets are all handled
by a small number of goroutines and significantly reduces the amount of work the
goscheduler has to perform.

We let connections in other states follow the current path where the
endpoint's goroutine directly handles the segments.

Updates #231

PiperOrigin-RevId: 289728325
---
 benchmarks/tcp/tcp_proxy.go                 |   6 +-
 pkg/sleep/sleep_test.go                     |  31 +++
 pkg/tcpip/stack/transport_demuxer.go        |  54 ++++-
 pkg/tcpip/transport/tcp/BUILD               |  15 +-
 pkg/tcpip/transport/tcp/accept.go           |   9 +-
 pkg/tcpip/transport/tcp/connect.go          | 310 ++++++++++++++++------------
 pkg/tcpip/transport/tcp/dispatcher.go       | 218 +++++++++++++++++++
 pkg/tcpip/transport/tcp/endpoint.go         | 303 ++++++++++++++++++---------
 pkg/tcpip/transport/tcp/endpoint_state.go   |  30 +--
 pkg/tcpip/transport/tcp/protocol.go         |  11 +
 pkg/tcpip/transport/tcp/rcv.go              |  21 +-
 pkg/tcpip/transport/tcp/snd.go              |  14 +-
 pkg/tcpip/transport/tcp/tcp_test.go         |  11 +-
 test/syscalls/linux/socket_inet_loopback.cc |   2 +-
 test/syscalls/linux/tcp_socket.cc           |  14 ++
 15 files changed, 769 insertions(+), 280 deletions(-)
 create mode 100644 pkg/tcpip/transport/tcp/dispatcher.go

(limited to 'test/syscalls/linux')

diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go
index be0d7bdd6..dc96add66 100644
--- a/benchmarks/tcp/tcp_proxy.go
+++ b/benchmarks/tcp/tcp_proxy.go
@@ -85,7 +85,7 @@ func (netImpl) printStats() {
 
 const (
 	nicID      = 1       // Fixed.
-	rcvBufSize = 1 << 20 // 1MB.
+	rcvBufSize = 4 << 20 // 4MB.
 )
 
 type netstackImpl struct {
@@ -130,6 +130,10 @@ func setupNetwork(ifaceName string, numChannels int) (fds []int, err error) {
 				return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", rcvBufSize, err)
 			}
 
+			if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, rcvBufSize); err != nil {
+				return nil, fmt.Errorf("setsockopt(..., SO_SNDBUF, %v,..) = %v", rcvBufSize, err)
+			}
+
 			if !*swgso && *gso != 0 {
 				if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
 					return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
diff --git a/pkg/sleep/sleep_test.go b/pkg/sleep/sleep_test.go
index 130806c86..af47e2ba1 100644
--- a/pkg/sleep/sleep_test.go
+++ b/pkg/sleep/sleep_test.go
@@ -376,6 +376,37 @@ func TestRace(t *testing.T) {
 	}
 }
 
+// TestRaceInOrder tests that multiple wakers can continuously send wake requests to
+// the sleeper and that the wakers are retrieved in the order asserted.
+func TestRaceInOrder(t *testing.T) {
+	const wakers = 100
+	const wakeRequests = 10000
+
+	w := make([]Waker, wakers)
+	s := Sleeper{}
+
+	// Associate each waker and start a goroutine that will assert them.
+	for i := range w {
+		s.AddWaker(&w[i], i)
+	}
+	go func() {
+		n := 0
+		for n < wakeRequests {
+			wk := w[n%len(w)]
+			wk.Assert()
+			n++
+		}
+	}()
+
+	// Wait for all wake up notifications from all wakers.
+	for i := 0; i < wakeRequests; i++ {
+		v, _ := s.Fetch(true)
+		if got, want := v, i%wakers; got != want {
+			t.Fatalf("got  %d want %d", got, want)
+		}
+	}
+}
+
 // BenchmarkSleeperMultiSelect measures how long it takes to fetch a wake up
 // from 4 wakers when at least one is already asserted.
 func BenchmarkSleeperMultiSelect(b *testing.B) {
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index f384a91de..d686e6eb8 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -104,7 +104,14 @@ func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, p
 		return
 	}
 	// multiPortEndpoints are guaranteed to have at least one element.
-	selectEndpoint(id, mpep, epsByNic.seed).HandlePacket(r, id, pkt)
+	transEP := selectEndpoint(id, mpep, epsByNic.seed)
+	if queuedProtocol, mustQueue := mpep.demux.queuedProtocols[protocolIDs{mpep.netProto, mpep.transProto}]; mustQueue {
+		queuedProtocol.QueuePacket(r, transEP, id, pkt)
+		epsByNic.mu.RUnlock()
+		return
+	}
+
+	transEP.HandlePacket(r, id, pkt)
 	epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
 }
 
@@ -130,7 +137,7 @@ func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpoint
 
 // registerEndpoint returns true if it succeeds. It fails and returns
 // false if ep already has an element with the same key.
-func (epsByNic *endpointsByNic) registerEndpoint(t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+func (epsByNic *endpointsByNic) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
 	epsByNic.mu.Lock()
 	defer epsByNic.mu.Unlock()
 
@@ -140,7 +147,7 @@ func (epsByNic *endpointsByNic) registerEndpoint(t TransportEndpoint, reusePort
 	}
 
 	// This is a new binding.
-	multiPortEp := &multiPortEndpoint{}
+	multiPortEp := &multiPortEndpoint{demux: d, netProto: netProto, transProto: transProto}
 	multiPortEp.endpointsMap = make(map[TransportEndpoint]int)
 	multiPortEp.reuse = reusePort
 	epsByNic.endpoints[bindToDevice] = multiPortEp
@@ -168,18 +175,34 @@ func (epsByNic *endpointsByNic) unregisterEndpoint(bindToDevice tcpip.NICID, t T
 // newTransportDemuxer.
 type transportDemuxer struct {
 	// protocol is immutable.
-	protocol map[protocolIDs]*transportEndpoints
+	protocol        map[protocolIDs]*transportEndpoints
+	queuedProtocols map[protocolIDs]queuedTransportProtocol
+}
+
+// queuedTransportProtocol if supported by a protocol implementation will cause
+// the dispatcher to deliver packets to the QueuePacket method instead of
+// calling HandlePacket directly on the endpoint.
+type queuedTransportProtocol interface {
+	QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt tcpip.PacketBuffer)
 }
 
 func newTransportDemuxer(stack *Stack) *transportDemuxer {
-	d := &transportDemuxer{protocol: make(map[protocolIDs]*transportEndpoints)}
+	d := &transportDemuxer{
+		protocol:        make(map[protocolIDs]*transportEndpoints),
+		queuedProtocols: make(map[protocolIDs]queuedTransportProtocol),
+	}
 
 	// Add each network and transport pair to the demuxer.
 	for netProto := range stack.networkProtocols {
 		for proto := range stack.transportProtocols {
-			d.protocol[protocolIDs{netProto, proto}] = &transportEndpoints{
+			protoIDs := protocolIDs{netProto, proto}
+			d.protocol[protoIDs] = &transportEndpoints{
 				endpoints: make(map[TransportEndpointID]*endpointsByNic),
 			}
+			qTransProto, isQueued := (stack.transportProtocols[proto].proto).(queuedTransportProtocol)
+			if isQueued {
+				d.queuedProtocols[protoIDs] = qTransProto
+			}
 		}
 	}
 
@@ -209,7 +232,11 @@ func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNum
 //
 // +stateify savable
 type multiPortEndpoint struct {
-	mu           sync.RWMutex `state:"nosave"`
+	mu         sync.RWMutex `state:"nosave"`
+	demux      *transportDemuxer
+	netProto   tcpip.NetworkProtocolNumber
+	transProto tcpip.TransportProtocolNumber
+
 	endpointsArr []TransportEndpoint
 	endpointsMap map[TransportEndpoint]int
 	// reuse indicates if more than one endpoint is allowed.
@@ -258,13 +285,22 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32
 
 func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
 	ep.mu.RLock()
+	queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}]
 	for i, endpoint := range ep.endpointsArr {
 		// HandlePacket takes ownership of pkt, so each endpoint needs
 		// its own copy except for the final one.
 		if i == len(ep.endpointsArr)-1 {
+			if mustQueue {
+				queuedProtocol.QueuePacket(r, endpoint, id, pkt)
+				break
+			}
 			endpoint.HandlePacket(r, id, pkt)
 			break
 		}
+		if mustQueue {
+			queuedProtocol.QueuePacket(r, endpoint, id, pkt.Clone())
+			continue
+		}
 		endpoint.HandlePacket(r, id, pkt.Clone())
 	}
 	ep.mu.RUnlock() // Don't use defer for performance reasons.
@@ -357,7 +393,7 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
 
 	if epsByNic, ok := eps.endpoints[id]; ok {
 		// There was already a binding.
-		return epsByNic.registerEndpoint(ep, reusePort, bindToDevice)
+		return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
 	}
 
 	// This is a new binding.
@@ -367,7 +403,7 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
 	}
 	eps.endpoints[id] = epsByNic
 
-	return epsByNic.registerEndpoint(ep, reusePort, bindToDevice)
+	return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
 }
 
 // unregisterEndpoint unregisters the endpoint with the given id such that it
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 353bd06f4..0e3ab05ad 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -16,6 +16,18 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "tcp_endpoint_list",
+    out = "tcp_endpoint_list.go",
+    package = "tcp",
+    prefix = "endpoint",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*endpoint",
+        "Linker": "*endpoint",
+    },
+)
+
 go_library(
     name = "tcp",
     srcs = [
@@ -23,6 +35,7 @@ go_library(
         "connect.go",
         "cubic.go",
         "cubic_state.go",
+        "dispatcher.go",
         "endpoint.go",
         "endpoint_state.go",
         "forwarder.go",
@@ -38,6 +51,7 @@ go_library(
         "segment_state.go",
         "snd.go",
         "snd_state.go",
+        "tcp_endpoint_list.go",
         "tcp_segment_list.go",
         "timer.go",
     ],
@@ -45,7 +59,6 @@ go_library(
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
-        "//pkg/log",
         "//pkg/rand",
         "//pkg/sleep",
         "//pkg/sync",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 1ea996936..1a2e3efa9 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -285,7 +285,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	// listenEP is nil when listenContext is used by tcp.Forwarder.
 	if l.listenEP != nil {
 		l.listenEP.mu.Lock()
-		if l.listenEP.state != StateListen {
+		if l.listenEP.EndpointState() != StateListen {
 			l.listenEP.mu.Unlock()
 			return nil, tcpip.ErrConnectionAborted
 		}
@@ -344,11 +344,12 @@ func (l *listenContext) closeAllPendingEndpoints() {
 // instead.
 func (e *endpoint) deliverAccepted(n *endpoint) {
 	e.mu.Lock()
-	state := e.state
+	state := e.EndpointState()
 	e.pendingAccepted.Add(1)
 	defer e.pendingAccepted.Done()
 	acceptedChan := e.acceptedChan
 	e.mu.Unlock()
+
 	if state == StateListen {
 		acceptedChan <- n
 		e.waiterQueue.Notify(waiter.EventIn)
@@ -562,8 +563,8 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// We do not use transitionToStateEstablishedLocked here as there is
 		// no handshake state available when doing a SYN cookie based accept.
 		n.stack.Stats().TCP.CurrentEstablished.Increment()
-		n.state = StateEstablished
 		n.isConnectNotified = true
+		n.setEndpointState(StateEstablished)
 
 		// Do the delivery in a separate goroutine so
 		// that we don't block the listen loop in case
@@ -596,7 +597,7 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 		// handleSynSegment() from attempting to queue new connections
 		// to the endpoint.
 		e.mu.Lock()
-		e.state = StateClose
+		e.setEndpointState(StateClose)
 
 		// close any endpoints in SYN-RCVD state.
 		ctx.closeAllPendingEndpoints()
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 613ec1775..f3896715b 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -190,7 +190,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
 	h.mss = opts.MSS
 	h.sndWndScale = opts.WS
 	h.ep.mu.Lock()
-	h.ep.state = StateSynRecv
+	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
 }
 
@@ -274,14 +274,14 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	// SYN-RCVD state.
 	h.state = handshakeSynRcvd
 	h.ep.mu.Lock()
-	h.ep.state = StateSynRecv
 	ttl := h.ep.ttl
+	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
 	synOpts := header.TCPSynOptions{
 		WS:    int(h.effectiveRcvWndScale()),
 		TS:    rcvSynOpts.TS,
 		TSVal: h.ep.timestamp(),
-		TSEcr: h.ep.recentTS,
+		TSEcr: h.ep.recentTimestamp(),
 
 		// We only send SACKPermitted if the other side indicated it
 		// permits SACK. This is not explicitly defined in the RFC but
@@ -341,7 +341,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			WS:            h.rcvWndScale,
 			TS:            h.ep.sendTSOk,
 			TSVal:         h.ep.timestamp(),
-			TSEcr:         h.ep.recentTS,
+			TSEcr:         h.ep.recentTimestamp(),
 			SACKPermitted: h.ep.sackPermitted,
 			MSS:           h.ep.amss,
 		}
@@ -501,7 +501,7 @@ func (h *handshake) execute() *tcpip.Error {
 		WS:            h.rcvWndScale,
 		TS:            true,
 		TSVal:         h.ep.timestamp(),
-		TSEcr:         h.ep.recentTS,
+		TSEcr:         h.ep.recentTimestamp(),
 		SACKPermitted: bool(sackEnabled),
 		MSS:           h.ep.amss,
 	}
@@ -792,7 +792,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
 		// Ref: https://tools.ietf.org/html/rfc7323#section-5.4.
 		offset += header.EncodeNOP(options[offset:])
 		offset += header.EncodeNOP(options[offset:])
-		offset += header.EncodeTSOption(e.timestamp(), uint32(e.recentTS), options[offset:])
+		offset += header.EncodeTSOption(e.timestamp(), e.recentTimestamp(), options[offset:])
 	}
 	if e.sackPermitted && len(sackBlocks) > 0 {
 		offset += header.EncodeNOP(options[offset:])
@@ -811,7 +811,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
 // sendRaw sends a TCP segment to the endpoint's peer.
 func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error {
 	var sackBlocks []header.SACKBlock
-	if e.state == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
+	if e.EndpointState() == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
 		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
 	}
 	options := e.makeOptions(sackBlocks)
@@ -848,6 +848,9 @@ func (e *endpoint) handleWrite() *tcpip.Error {
 }
 
 func (e *endpoint) handleClose() *tcpip.Error {
+	if !e.EndpointState().connected() {
+		return nil
+	}
 	// Drain the send queue.
 	e.handleWrite()
 
@@ -864,11 +867,7 @@ func (e *endpoint) handleClose() *tcpip.Error {
 func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 	// Only send a reset if the connection is being aborted for a reason
 	// other than receiving a reset.
-	if e.state == StateEstablished || e.state == StateCloseWait {
-		e.stack.Stats().TCP.EstablishedResets.Increment()
-		e.stack.Stats().TCP.CurrentEstablished.Decrement()
-	}
-	e.state = StateError
+	e.setEndpointState(StateError)
 	e.HardError = err
 	if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout {
 		// The exact sequence number to be used for the RST is the same as the
@@ -888,9 +887,12 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 }
 
 // completeWorkerLocked is called by the worker goroutine when it's about to
-// exit. It marks the worker as completed and performs cleanup work if requested
-// by Close().
+// exit.
 func (e *endpoint) completeWorkerLocked() {
+	// The worker is terminating (due to moving to either the
+	// CLOSED or ERROR state); ensure we release all
+	// registrations and port reservations even if the socket
+	// itself is not yet closed by the application.
 	e.workerRunning = false
 	if e.workerCleanup {
 		e.cleanupLocked()
@@ -917,8 +919,7 @@ func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
 		e.rcvAutoParams.prevCopied = int(h.rcvWnd)
 		e.rcvListMu.Unlock()
 	}
-	h.ep.stack.Stats().TCP.CurrentEstablished.Increment()
-	e.state = StateEstablished
+	e.setEndpointState(StateEstablished)
 }
 
 // transitionToStateCloseLocked ensures that the endpoint is
@@ -927,11 +928,12 @@ func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
 // delivered to this endpoint from the demuxer when the endpoint
 // is transitioned to StateClose.
 func (e *endpoint) transitionToStateCloseLocked() {
-	if e.state == StateClose {
+	if e.EndpointState() == StateClose {
 		return
 	}
+	// Mark the endpoint as fully closed for reads/writes.
 	e.cleanupLocked()
-	e.state = StateClose
+	e.setEndpointState(StateClose)
 	e.stack.Stats().TCP.EstablishedClosed.Increment()
 }
 
@@ -946,7 +948,9 @@ func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 		s.decRef()
 		return
 	}
-	ep.(*endpoint).enqueueSegment(s)
+	if ep.(*endpoint).enqueueSegment(s) {
+		ep.(*endpoint).newSegmentWaker.Assert()
+	}
 }
 
 func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
@@ -955,9 +959,8 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 		// except SYN-SENT, all reset (RST) segments are
 		// validated by checking their SEQ-fields." So
 		// we only process it if it's acceptable.
-		s.decRef()
 		e.mu.Lock()
-		switch e.state {
+		switch e.EndpointState() {
 		// In case of a RST in CLOSE-WAIT linux moves
 		// the socket to closed state with an error set
 		// to indicate EPIPE.
@@ -981,103 +984,57 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 			e.transitionToStateCloseLocked()
 			e.HardError = tcpip.ErrAborted
 			e.mu.Unlock()
+			e.notifyProtocolGoroutine(notifyTickleWorker)
 			return false, nil
 		default:
 			e.mu.Unlock()
+			// RFC 793, page 37 states that "in all states
+			// except SYN-SENT, all reset (RST) segments are
+			// validated by checking their SEQ-fields." So
+			// we only process it if it's acceptable.
+
+			// Notify protocol goroutine. This is required when
+			// handleSegment is invoked from the processor goroutine
+			// rather than the worker goroutine.
+			e.notifyProtocolGoroutine(notifyResetByPeer)
 			return false, tcpip.ErrConnectionReset
 		}
 	}
 	return true, nil
 }
 
-// handleSegments pulls segments from the queue and processes them. It returns
-// no error if the protocol loop should continue, an error otherwise.
-func (e *endpoint) handleSegments() *tcpip.Error {
+// handleSegments processes all inbound segments.
+func (e *endpoint) handleSegments(fastPath bool) *tcpip.Error {
 	checkRequeue := true
 	for i := 0; i < maxSegmentsPerWake; i++ {
+		if e.EndpointState() == StateClose || e.EndpointState() == StateError {
+			return nil
+		}
 		s := e.segmentQueue.dequeue()
 		if s == nil {
 			checkRequeue = false
 			break
 		}
 
-		// Invoke the tcp probe if installed.
-		if e.probe != nil {
-			e.probe(e.completeState())
+		cont, err := e.handleSegment(s)
+		if err != nil {
+			s.decRef()
+			e.mu.Lock()
+			e.setEndpointState(StateError)
+			e.HardError = err
+			e.mu.Unlock()
+			return err
 		}
-
-		if s.flagIsSet(header.TCPFlagRst) {
-			if ok, err := e.handleReset(s); !ok {
-				return err
-			}
-		} else if s.flagIsSet(header.TCPFlagSyn) {
-			// See: https://tools.ietf.org/html/rfc5961#section-4.1
-			//   1) If the SYN bit is set, irrespective of the sequence number, TCP
-			//    MUST send an ACK (also referred to as challenge ACK) to the remote
-			//    peer:
-			//
-			//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
-			//
-			//    After sending the acknowledgment, TCP MUST drop the unacceptable
-			//    segment and stop processing further.
-			//
-			// By sending an ACK, the remote peer is challenged to confirm the loss
-			// of the previous connection and the request to start a new connection.
-			// A legitimate peer, after restart, would not have a TCB in the
-			// synchronized state.  Thus, when the ACK arrives, the peer should send
-			// a RST segment back with the sequence number derived from the ACK
-			// field that caused the RST.
-
-			// This RST will confirm that the remote peer has indeed closed the
-			// previous connection.  Upon receipt of a valid RST, the local TCP
-			// endpoint MUST terminate its connection.  The local TCP endpoint
-			// should then rely on SYN retransmission from the remote end to
-			// re-establish the connection.
-
-			e.snd.sendAck()
-		} else if s.flagIsSet(header.TCPFlagAck) {
-			// Patch the window size in the segment according to the
-			// send window scale.
-			s.window <<= e.snd.sndWndScale
-
-			// RFC 793, page 41 states that "once in the ESTABLISHED
-			// state all segments must carry current acknowledgment
-			// information."
-			drop, err := e.rcv.handleRcvdSegment(s)
-			if err != nil {
-				s.decRef()
-				return err
-			}
-			if drop {
-				s.decRef()
-				continue
-			}
-
-			// Now check if the received segment has caused us to transition
-			// to a CLOSED state, if yes then terminate processing and do
-			// not invoke the sender.
-			e.mu.RLock()
-			state := e.state
-			e.mu.RUnlock()
-			if state == StateClose {
-				// When we get into StateClose while processing from the queue,
-				// return immediately and let the protocolMainloop handle it.
-				//
-				// We can reach StateClose only while processing a previous segment
-				// or a notification from the protocolMainLoop (caller goroutine).
-				// This means that with this return, the segment dequeue below can
-				// never occur on a closed endpoint.
-				s.decRef()
-				return nil
-			}
-			e.snd.handleRcvdSegment(s)
+		if !cont {
+			s.decRef()
+			return nil
 		}
-		s.decRef()
 	}
 
-	// If the queue is not empty, make sure we'll wake up in the next
-	// iteration.
-	if checkRequeue && !e.segmentQueue.empty() {
+	// When fastPath is true we don't want to wake up the worker
+	// goroutine. If the endpoint has more segments to process the
+	// dispatcher will call handleSegments again anyway.
+	if !fastPath && checkRequeue && !e.segmentQueue.empty() {
 		e.newSegmentWaker.Assert()
 	}
 
@@ -1086,11 +1043,88 @@ func (e *endpoint) handleSegments() *tcpip.Error {
 		e.snd.sendAck()
 	}
 
-	e.resetKeepaliveTimer(true)
+	e.resetKeepaliveTimer(true /* receivedData */)
 
 	return nil
 }
 
+// handleSegment handles a given segment and notifies the worker goroutine
+// if the connection should be terminated.
+func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
+	// Invoke the tcp probe if installed.
+	if e.probe != nil {
+		e.probe(e.completeState())
+	}
+
+	if s.flagIsSet(header.TCPFlagRst) {
+		if ok, err := e.handleReset(s); !ok {
+			return false, err
+		}
+	} else if s.flagIsSet(header.TCPFlagSyn) {
+		// See: https://tools.ietf.org/html/rfc5961#section-4.1
+		//   1) If the SYN bit is set, irrespective of the sequence number, TCP
+		//    MUST send an ACK (also referred to as challenge ACK) to the remote
+		//    peer:
+		//
+		//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+		//
+		//    After sending the acknowledgment, TCP MUST drop the unacceptable
+		//    segment and stop processing further.
+		//
+		// By sending an ACK, the remote peer is challenged to confirm the loss
+		// of the previous connection and the request to start a new connection.
+		// A legitimate peer, after restart, would not have a TCB in the
+		// synchronized state.  Thus, when the ACK arrives, the peer should send
+		// a RST segment back with the sequence number derived from the ACK
+		// field that caused the RST.
+
+		// This RST will confirm that the remote peer has indeed closed the
+		// previous connection.  Upon receipt of a valid RST, the local TCP
+		// endpoint MUST terminate its connection.  The local TCP endpoint
+		// should then rely on SYN retransmission from the remote end to
+		// re-establish the connection.
+
+		e.snd.sendAck()
+	} else if s.flagIsSet(header.TCPFlagAck) {
+		// Patch the window size in the segment according to the
+		// send window scale.
+		s.window <<= e.snd.sndWndScale
+
+		// RFC 793, page 41 states that "once in the ESTABLISHED
+		// state all segments must carry current acknowledgment
+		// information."
+		drop, err := e.rcv.handleRcvdSegment(s)
+		if err != nil {
+			return false, err
+		}
+		if drop {
+			return true, nil
+		}
+
+		// Now check if the received segment has caused us to transition
+		// to a CLOSED state, if yes then terminate processing and do
+		// not invoke the sender.
+		e.mu.RLock()
+		state := e.state
+		e.mu.RUnlock()
+		if state == StateClose {
+			// When we get into StateClose while processing from the queue,
+			// return immediately and let the protocolMainloop handle it.
+			//
+			// We can reach StateClose only while processing a previous segment
+			// or a notification from the protocolMainLoop (caller goroutine).
+			// This means that with this return, the segment dequeue below can
+			// never occur on a closed endpoint.
+			s.decRef()
+			return false, nil
+		}
+
+		e.snd.handleRcvdSegment(s)
+	}
+
+	return true, nil
+}
+
 // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP
 // keepalive packets periodically when the connection is idle. If we don't hear
 // from the other side after a number of tries, we terminate the connection.
@@ -1160,7 +1194,7 @@ func (e *endpoint) disableKeepaliveTimer() {
 // protocolMainLoop is the main loop of the TCP protocol. It runs in its own
 // goroutine and is responsible for sending segments and handling received
 // segments.
-func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
+func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) *tcpip.Error {
 	var closeTimer *time.Timer
 	var closeWaker sleep.Waker
 
@@ -1182,6 +1216,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		}
 
 		e.mu.Unlock()
+		e.workMu.Unlock()
 		// When the protocol loop exits we should wake up our waiters.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 	}
@@ -1193,7 +1228,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		initialRcvWnd := e.initialReceiveWindow()
 		h := newHandshake(e, seqnum.Size(initialRcvWnd))
 		e.mu.Lock()
-		h.ep.state = StateSynSent
+		h.ep.setEndpointState(StateSynSent)
 		e.mu.Unlock()
 
 		if err := h.execute(); err != nil {
@@ -1202,12 +1237,11 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			e.lastErrorMu.Unlock()
 
 			e.mu.Lock()
-			e.state = StateError
+			e.setEndpointState(StateError)
 			e.HardError = err
 
 			// Lock released below.
 			epilogue()
-
 			return err
 		}
 	}
@@ -1215,7 +1249,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	e.keepalive.timer.init(&e.keepalive.waker)
 	defer e.keepalive.timer.cleanup()
 
-	// Tell waiters that the endpoint is connected and writable.
 	e.mu.Lock()
 	drained := e.drainDone != nil
 	e.mu.Unlock()
@@ -1224,8 +1257,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		<-e.undrain
 	}
 
-	e.waiterQueue.Notify(waiter.EventOut)
-
 	// Set up the functions that will be called when the main protocol loop
 	// wakes up.
 	funcs := []struct {
@@ -1240,18 +1271,15 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			w: &e.sndCloseWaker,
 			f: e.handleClose,
 		},
-		{
-			w: &e.newSegmentWaker,
-			f: e.handleSegments,
-		},
 		{
 			w: &closeWaker,
 			f: func() *tcpip.Error {
 				// This means the socket is being closed due
-				// to the TCP_FIN_WAIT2 timeout was hit. Just
+				// to the TCP-FIN-WAIT2 timeout was hit. Just
 				// mark the socket as closed.
 				e.mu.Lock()
 				e.transitionToStateCloseLocked()
+				e.workerCleanup = true
 				e.mu.Unlock()
 				return nil
 			},
@@ -1266,6 +1294,12 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 				return nil
 			},
 		},
+		{
+			w: &e.newSegmentWaker,
+			f: func() *tcpip.Error {
+				return e.handleSegments(false /* fastPath */)
+			},
+		},
 		{
 			w: &e.keepalive.waker,
 			f: e.keepaliveTimerExpired,
@@ -1293,14 +1327,16 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 				}
 
 				if n&notifyReset != 0 {
-					e.mu.Lock()
-					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
-					e.mu.Unlock()
+					return tcpip.ErrConnectionAborted
+				}
+
+				if n&notifyResetByPeer != 0 {
+					return tcpip.ErrConnectionReset
 				}
 
 				if n&notifyClose != 0 && closeTimer == nil {
 					e.mu.Lock()
-					if e.state == StateFinWait2 && e.closed {
+					if e.EndpointState() == StateFinWait2 && e.closed {
 						// The socket has been closed and we are in FIN_WAIT2
 						// so start the FIN_WAIT2 timer.
 						closeTimer = time.AfterFunc(e.tcpLingerTimeout, func() {
@@ -1320,11 +1356,11 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 
 				if n&notifyDrain != 0 {
 					for !e.segmentQueue.empty() {
-						if err := e.handleSegments(); err != nil {
+						if err := e.handleSegments(false /* fastPath */); err != nil {
 							return err
 						}
 					}
-					if e.state != StateClose && e.state != StateError {
+					if e.EndpointState() != StateClose && e.EndpointState() != StateError {
 						// Only block the worker if the endpoint
 						// is not in closed state or error state.
 						close(e.drainDone)
@@ -1349,14 +1385,21 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		s.AddWaker(funcs[i].w, i)
 	}
 
+	// Notify the caller that the waker initialization is complete and the
+	// endpoint is ready.
+	if wakerInitDone != nil {
+		close(wakerInitDone)
+	}
+
+	// Tell waiters that the endpoint is connected and writable.
+	e.waiterQueue.Notify(waiter.EventOut)
+
 	// The following assertions and notifications are needed for restored
 	// endpoints. Fresh newly created endpoints have empty states and should
 	// not invoke any.
-	e.segmentQueue.mu.Lock()
-	if !e.segmentQueue.list.Empty() {
+	if !e.segmentQueue.empty() {
 		e.newSegmentWaker.Assert()
 	}
-	e.segmentQueue.mu.Unlock()
 
 	e.rcvListMu.Lock()
 	if !e.rcvList.Empty() {
@@ -1372,27 +1415,32 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	// Main loop. Handle segments until both send and receive ends of the
 	// connection have completed.
 
-	for e.state != StateTimeWait && e.state != StateClose && e.state != StateError {
+	for e.EndpointState() != StateTimeWait && e.EndpointState() != StateClose && e.EndpointState() != StateError {
 		e.mu.Unlock()
 		e.workMu.Unlock()
 		v, _ := s.Fetch(true)
 		e.workMu.Lock()
+		// We need to double check here because the notification
+		// may be stale by the time we get around to processing it.
+		// NOTE: since we now hold the workMu the processors cannot
+		// change the state of the endpoint so it's safe to proceed
+		// after this check.
+		if e.EndpointState() == StateTimeWait || e.EndpointState() == StateClose || e.EndpointState() == StateError {
+			e.mu.Lock()
+			break
+		}
 		if err := funcs[v].f(); err != nil {
 			e.mu.Lock()
-			// Ensure we release all endpoint registration and route
-			// references as the connection is now in an error
-			// state.
 			e.workerCleanup = true
 			e.resetConnectionLocked(err)
 			// Lock released below.
 			epilogue()
-
 			return nil
 		}
 		e.mu.Lock()
 	}
 
-	state := e.state
+	state := e.EndpointState()
 	e.mu.Unlock()
 	var reuseTW func()
 	if state == StateTimeWait {
@@ -1405,13 +1453,15 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		s.Done()
 		// Wake up any waiters before we enter TIME_WAIT.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+		e.mu.Lock()
+		e.workerCleanup = true
+		e.mu.Unlock()
 		reuseTW = e.doTimeWait()
 	}
 
 	// Mark endpoint as closed.
 	e.mu.Lock()
-	if e.state != StateError {
-		e.stack.Stats().TCP.CurrentEstablished.Decrement()
+	if e.EndpointState() != StateError {
 		e.transitionToStateCloseLocked()
 	}
 
@@ -1468,7 +1518,11 @@ func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()
 					tcpEP := listenEP.(*endpoint)
 					if EndpointState(tcpEP.State()) == StateListen {
 						reuseTW = func() {
-							tcpEP.enqueueSegment(s)
+							if !tcpEP.enqueueSegment(s) {
+								s.decRef()
+								return
+							}
+							tcpEP.newSegmentWaker.Assert()
 						}
 						// We explicitly do not decRef
 						// the segment as it's still
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
new file mode 100644
index 000000000..a72f0c379
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -0,0 +1,218 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// epQueue is a queue of endpoints.
+type epQueue struct {
+	mu   sync.Mutex
+	list endpointList
+}
+
+// enqueue adds e to the queue if the endpoint is not already on the queue.
+func (q *epQueue) enqueue(e *endpoint) {
+	q.mu.Lock()
+	if e.pendingProcessing {
+		q.mu.Unlock()
+		return
+	}
+	q.list.PushBack(e)
+	e.pendingProcessing = true
+	q.mu.Unlock()
+}
+
+// dequeue removes and returns the first element from the queue if available,
+// returns nil otherwise.
+func (q *epQueue) dequeue() *endpoint {
+	q.mu.Lock()
+	if e := q.list.Front(); e != nil {
+		q.list.Remove(e)
+		e.pendingProcessing = false
+		q.mu.Unlock()
+		return e
+	}
+	q.mu.Unlock()
+	return nil
+}
+
+// empty returns true if the queue is empty, false otherwise.
+func (q *epQueue) empty() bool {
+	q.mu.Lock()
+	v := q.list.Empty()
+	q.mu.Unlock()
+	return v
+}
+
+// processor is responsible for processing packets queued to a tcp endpoint.
+type processor struct {
+	epQ              epQueue
+	newEndpointWaker sleep.Waker
+	id               int
+}
+
+func newProcessor(id int) *processor {
+	p := &processor{
+		id: id,
+	}
+	go p.handleSegments()
+	return p
+}
+
+func (p *processor) queueEndpoint(ep *endpoint) {
+	// Queue an endpoint for processing by the processor goroutine.
+	p.epQ.enqueue(ep)
+	p.newEndpointWaker.Assert()
+}
+
+func (p *processor) handleSegments() {
+	const newEndpointWaker = 1
+	s := sleep.Sleeper{}
+	s.AddWaker(&p.newEndpointWaker, newEndpointWaker)
+	defer s.Done()
+	for {
+		s.Fetch(true)
+		for ep := p.epQ.dequeue(); ep != nil; ep = p.epQ.dequeue() {
+			if ep.segmentQueue.empty() {
+				continue
+			}
+
+			// If socket has transitioned out of connected state
+			// then just let the worker handle the packet.
+			//
+			// NOTE: We read this outside of e.mu lock which means
+			// that by the time we get to handleSegments the
+			// endpoint may not be in ESTABLISHED. But this should
+			// be fine as all normal shutdown states are handled by
+			// handleSegments and if the endpoint moves to a
+			// CLOSED/ERROR state then handleSegments is a noop.
+			if ep.EndpointState() != StateEstablished {
+				ep.newSegmentWaker.Assert()
+				continue
+			}
+
+			if !ep.workMu.TryLock() {
+				ep.newSegmentWaker.Assert()
+				continue
+			}
+			// If the endpoint is in a connected state then we do
+			// direct delivery to ensure low latency and avoid
+			// scheduler interactions.
+			if err := ep.handleSegments(true /* fastPath */); err != nil || ep.EndpointState() == StateClose {
+				ep.notifyProtocolGoroutine(notifyTickleWorker)
+				ep.workMu.Unlock()
+				continue
+			}
+
+			if !ep.segmentQueue.empty() {
+				p.epQ.enqueue(ep)
+			}
+
+			ep.workMu.Unlock()
+		}
+	}
+}
+
+// dispatcher manages a pool of TCP endpoint processors which are responsible
+// for the processing of inbound segments. This fixed pool of processor
+// goroutines do full tcp processing. The processor is selected based on the
+// hash of the endpoint id to ensure that delivery for the same endpoint happens
+// in-order.
+type dispatcher struct {
+	processors []*processor
+	seed       uint32
+}
+
+func newDispatcher(nProcessors int) *dispatcher {
+	processors := []*processor{}
+	for i := 0; i < nProcessors; i++ {
+		processors = append(processors, newProcessor(i))
+	}
+	return &dispatcher{
+		processors: processors,
+		seed:       generateRandUint32(),
+	}
+}
+
+func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+	ep := stackEP.(*endpoint)
+	s := newSegment(r, id, pkt)
+	if !s.parse() {
+		ep.stack.Stats().MalformedRcvdPackets.Increment()
+		ep.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
+		ep.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
+		s.decRef()
+		return
+	}
+
+	if !s.csumValid {
+		ep.stack.Stats().MalformedRcvdPackets.Increment()
+		ep.stack.Stats().TCP.ChecksumErrors.Increment()
+		ep.stats.ReceiveErrors.ChecksumErrors.Increment()
+		s.decRef()
+		return
+	}
+
+	ep.stack.Stats().TCP.ValidSegmentsReceived.Increment()
+	ep.stats.SegmentsReceived.Increment()
+	if (s.flags & header.TCPFlagRst) != 0 {
+		ep.stack.Stats().TCP.ResetsReceived.Increment()
+	}
+
+	if !ep.enqueueSegment(s) {
+		s.decRef()
+		return
+	}
+
+	// For sockets not in established state let the worker goroutine
+	// handle the packets.
+	if ep.EndpointState() != StateEstablished {
+		ep.newSegmentWaker.Assert()
+		return
+	}
+
+	d.selectProcessor(id).queueEndpoint(ep)
+}
+
+func generateRandUint32() uint32 {
+	b := make([]byte, 4)
+	if _, err := rand.Read(b); err != nil {
+		panic(err)
+	}
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func (d *dispatcher) selectProcessor(id stack.TransportEndpointID) *processor {
+	payload := []byte{
+		byte(id.LocalPort),
+		byte(id.LocalPort >> 8),
+		byte(id.RemotePort),
+		byte(id.RemotePort >> 8)}
+
+	h := jenkins.Sum32(d.seed)
+	h.Write(payload)
+	h.Write([]byte(id.LocalAddress))
+	h.Write([]byte(id.RemoteAddress))
+
+	return d.processors[h.Sum32()%uint32(len(d.processors))]
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index cc8b533c8..1799c6e10 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -120,6 +120,7 @@ const (
 	notifyMTUChanged
 	notifyDrain
 	notifyReset
+	notifyResetByPeer
 	notifyKeepaliveChanged
 	notifyMSSChanged
 	// notifyTickleWorker is used to tickle the protocol main loop during a
@@ -127,6 +128,7 @@ const (
 	// ensures the loop terminates if the final state of the endpoint is
 	// say TIME_WAIT.
 	notifyTickleWorker
+	notifyError
 )
 
 // SACKInfo holds TCP SACK related information for a given endpoint.
@@ -283,6 +285,18 @@ func (*EndpointInfo) IsEndpointInfo() {}
 type endpoint struct {
 	EndpointInfo
 
+	// endpointEntry is used to queue endpoints for processing to
+	// a given tcp processor goroutine.
+	//
+	// Precondition: epQueue.mu must be held to read/write this field.
+	endpointEntry `state:"nosave"`
+
+	// pendingProcessing is true if this endpoint is queued for processing
+	// to a TCP processor.
+	//
+	// Precondition: epQueue.mu must be held to read/write this field.
+	pendingProcessing bool `state:"nosave"`
+
 	// workMu is used to arbitrate which goroutine may perform protocol
 	// work. Only the main protocol goroutine is expected to call Lock() on
 	// it, but other goroutines (e.g., send) may call TryLock() to eagerly
@@ -324,6 +338,7 @@ type endpoint struct {
 	// The following fields are protected by the mutex.
 	mu sync.RWMutex `state:"nosave"`
 
+	// state must be read/set using the EndpointState()/setEndpointState() methods.
 	state EndpointState `state:".(EndpointState)"`
 
 	// origEndpointState is only used during a restore phase to save the
@@ -359,7 +374,7 @@ type endpoint struct {
 	workerRunning bool
 
 	// workerCleanup specifies if the worker goroutine must perform cleanup
-	// before exitting. This can only be set to true when workerRunning is
+	// before exiting. This can only be set to true when workerRunning is
 	// also true, and they're both protected by the mutex.
 	workerCleanup bool
 
@@ -371,6 +386,8 @@ type endpoint struct {
 	// recentTS is the timestamp that should be sent in the TSEcr field of
 	// the timestamp for future segments sent by the endpoint. This field is
 	// updated if required when a new segment is received by this endpoint.
+	//
+	// recentTS must be read/written atomically.
 	recentTS uint32
 
 	// tsOffset is a randomized offset added to the value of the
@@ -567,6 +584,47 @@ func (e *endpoint) ResumeWork() {
 	e.workMu.Unlock()
 }
 
+// setEndpointState updates the state of the endpoint to state atomically. This
+// method is unexported as the only place we should update the state is in this
+// package but we allow the state to be read freely without holding e.mu.
+//
+// Precondition: e.mu must be held to call this method.
+func (e *endpoint) setEndpointState(state EndpointState) {
+	oldstate := EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
+	switch state {
+	case StateEstablished:
+		e.stack.Stats().TCP.CurrentEstablished.Increment()
+	case StateError:
+		fallthrough
+	case StateClose:
+		if oldstate == StateCloseWait || oldstate == StateEstablished {
+			e.stack.Stats().TCP.EstablishedResets.Increment()
+		}
+		fallthrough
+	default:
+		if oldstate == StateEstablished {
+			e.stack.Stats().TCP.CurrentEstablished.Decrement()
+		}
+	}
+	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
+}
+
+// EndpointState returns the current state of the endpoint.
+func (e *endpoint) EndpointState() EndpointState {
+	return EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
+}
+
+// setRecentTimestamp atomically sets the recentTS field to the
+// provided value.
+func (e *endpoint) setRecentTimestamp(recentTS uint32) {
+	atomic.StoreUint32(&e.recentTS, recentTS)
+}
+
+// recentTimestamp atomically reads and returns the value of the recentTS field.
+func (e *endpoint) recentTimestamp() uint32 {
+	return atomic.LoadUint32(&e.recentTS)
+}
+
 // keepalive is a synchronization wrapper used to appease stateify. See the
 // comment in endpoint, where it is used.
 //
@@ -656,7 +714,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 
-	switch e.state {
+	switch e.EndpointState() {
 	case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
 		// Ready for nothing.
 
@@ -672,7 +730,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 			}
 		}
 	}
-	if e.state.connected() {
+	if e.EndpointState().connected() {
 		// Determine if the endpoint is writable if requested.
 		if (mask & waiter.EventOut) != 0 {
 			e.sndBufMu.Lock()
@@ -733,14 +791,20 @@ func (e *endpoint) Close() {
 	// Issue a shutdown so that the peer knows we won't send any more data
 	// if we're connected, or stop accepting if we're listening.
 	e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
+	e.closeNoShutdown()
+}
 
+// closeNoShutdown closes the endpoint without doing a full shutdown. This is
+// used when a connection needs to be aborted with a RST and we want to skip
+// a full 4 way TCP shutdown.
+func (e *endpoint) closeNoShutdown() {
 	e.mu.Lock()
 
 	// For listening sockets, we always release ports inline so that they
 	// are immediately available for reuse after Close() is called. If also
 	// registered, we unregister as well otherwise the next user would fail
 	// in Listen() when trying to register.
-	if e.state == StateListen && e.isPortReserved {
+	if e.EndpointState() == StateListen && e.isPortReserved {
 		if e.isRegistered {
 			e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
 			e.isRegistered = false
@@ -780,6 +844,8 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
 		defer close(done)
 		for n := range e.acceptedChan {
 			n.notifyProtocolGoroutine(notifyReset)
+			// Close all connections that have completed but
+			// have not been accepted by the application.
 			n.Close()
 		}
 	}()
@@ -797,11 +863,13 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
 // after Close() is called and the worker goroutine (if any) is done with its
 // work.
 func (e *endpoint) cleanupLocked() {
+
 	// Close all endpoints that might have been accepted by TCP but not by
 	// the client.
 	if e.acceptedChan != nil {
 		e.closePendingAcceptableConnectionsLocked()
 	}
+
 	e.workerCleanup = false
 
 	if e.isRegistered {
@@ -920,7 +988,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 	// reads to proceed before returning a ECONNRESET.
 	e.rcvListMu.Lock()
 	bufUsed := e.rcvBufUsed
-	if s := e.state; !s.connected() && s != StateClose && bufUsed == 0 {
+	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
 		e.rcvListMu.Unlock()
 		he := e.HardError
 		e.mu.RUnlock()
@@ -944,7 +1012,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 
 func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 	if e.rcvBufUsed == 0 {
-		if e.rcvClosed || !e.state.connected() {
+		if e.rcvClosed || !e.EndpointState().connected() {
 			return buffer.View{}, tcpip.ErrClosedForReceive
 		}
 		return buffer.View{}, tcpip.ErrWouldBlock
@@ -980,8 +1048,8 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 // Caller must hold e.mu and e.sndBufMu
 func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
 	// The endpoint cannot be written to if it's not connected.
-	if !e.state.connected() {
-		switch e.state {
+	if !e.EndpointState().connected() {
+		switch e.EndpointState() {
 		case StateError:
 			return 0, e.HardError
 		default:
@@ -1039,42 +1107,86 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		return 0, nil, perr
 	}
 
-	if !opts.Atomic { // See above.
-		e.mu.RLock()
-		e.sndBufMu.Lock()
+	if opts.Atomic {
+		// Add data to the send queue.
+		s := newSegmentFromView(&e.route, e.ID, v)
+		e.sndBufUsed += len(v)
+		e.sndBufInQueue += seqnum.Size(len(v))
+		e.sndQueue.PushBack(s)
+		e.sndBufMu.Unlock()
+		// Release the endpoint lock to prevent deadlocks due to lock
+		// order inversion when acquiring workMu.
+		e.mu.RUnlock()
+	}
 
-		// Because we released the lock before copying, check state again
-		// to make sure the endpoint is still in a valid state for a write.
-		avail, err = e.isEndpointWritableLocked()
-		if err != nil {
+	if e.workMu.TryLock() {
+		// Since we released locks in between, it's possible that the
+		// endpoint transitioned to a CLOSED/ERROR state, so make sure
+		// the endpoint is still writable before trying to write.
+		if !opts.Atomic { // See above.
+			e.mu.RLock()
+			e.sndBufMu.Lock()
+
+			// Because we released the lock before copying, check state again
+			// to make sure the endpoint is still in a valid state for a write.
+			avail, err = e.isEndpointWritableLocked()
+			if err != nil {
+				e.sndBufMu.Unlock()
+				e.mu.RUnlock()
+				e.stats.WriteErrors.WriteClosed.Increment()
+				return 0, nil, err
+			}
+
+			// Discard any excess data copied in due to avail being reduced due
+			// to a simultaneous write call to the socket.
+			if avail < len(v) {
+				v = v[:avail]
+			}
+			// Add data to the send queue.
+			s := newSegmentFromView(&e.route, e.ID, v)
+			e.sndBufUsed += len(v)
+			e.sndBufInQueue += seqnum.Size(len(v))
+			e.sndQueue.PushBack(s)
 			e.sndBufMu.Unlock()
+			// Release the endpoint lock to prevent deadlocks due to lock
+			// order inversion when acquiring workMu.
 			e.mu.RUnlock()
-			e.stats.WriteErrors.WriteClosed.Increment()
-			return 0, nil, err
-		}
 
-		// Discard any excess data copied in due to avail being reduced due
-		// to a simultaneous write call to the socket.
-		if avail < len(v) {
-			v = v[:avail]
 		}
-	}
-
-	// Add data to the send queue.
-	s := newSegmentFromView(&e.route, e.ID, v)
-	e.sndBufUsed += len(v)
-	e.sndBufInQueue += seqnum.Size(len(v))
-	e.sndQueue.PushBack(s)
-	e.sndBufMu.Unlock()
-	// Release the endpoint lock to prevent deadlocks due to lock
-	// order inversion when acquiring workMu.
-	e.mu.RUnlock()
-
-	if e.workMu.TryLock() {
 		// Do the work inline.
 		e.handleWrite()
 		e.workMu.Unlock()
 	} else {
+		if !opts.Atomic { // See above.
+			e.mu.RLock()
+			e.sndBufMu.Lock()
+
+			// Because we released the lock before copying, check state again
+			// to make sure the endpoint is still in a valid state for a write.
+			avail, err = e.isEndpointWritableLocked()
+			if err != nil {
+				e.sndBufMu.Unlock()
+				e.mu.RUnlock()
+				e.stats.WriteErrors.WriteClosed.Increment()
+				return 0, nil, err
+			}
+
+			// Discard any excess data copied in due to avail being reduced due
+			// to a simultaneous write call to the socket.
+			if avail < len(v) {
+				v = v[:avail]
+			}
+			// Add data to the send queue.
+			s := newSegmentFromView(&e.route, e.ID, v)
+			e.sndBufUsed += len(v)
+			e.sndBufInQueue += seqnum.Size(len(v))
+			e.sndQueue.PushBack(s)
+			e.sndBufMu.Unlock()
+			// Release the endpoint lock to prevent deadlocks due to lock
+			// order inversion when acquiring workMu.
+			e.mu.RUnlock()
+
+		}
 		// Let the protocol goroutine do the work.
 		e.sndWaker.Assert()
 	}
@@ -1091,7 +1203,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 
 	// The endpoint can be read if it's connected, or if it's already closed
 	// but has some pending unread data.
-	if s := e.state; !s.connected() && s != StateClose {
+	if s := e.EndpointState(); !s.connected() && s != StateClose {
 		if s == StateError {
 			return 0, tcpip.ControlMessages{}, e.HardError
 		}
@@ -1103,7 +1215,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	defer e.rcvListMu.Unlock()
 
 	if e.rcvBufUsed == 0 {
-		if e.rcvClosed || !e.state.connected() {
+		if e.rcvClosed || !e.EndpointState().connected() {
 			e.stats.ReadErrors.ReadClosed.Increment()
 			return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
 		}
@@ -1187,7 +1299,7 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		defer e.mu.Unlock()
 
 		// We only allow this to be set when we're in the initial state.
-		if e.state != StateInitial {
+		if e.EndpointState() != StateInitial {
 			return tcpip.ErrInvalidEndpointState
 		}
 
@@ -1402,14 +1514,14 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 				// Acquire the work mutex as we may need to
 				// reinitialize the congestion control state.
 				e.mu.Lock()
-				state := e.state
+				state := e.EndpointState()
 				e.cc = v
 				e.mu.Unlock()
 				switch state {
 				case StateEstablished:
 					e.workMu.Lock()
 					e.mu.Lock()
-					if e.state == state {
+					if e.EndpointState() == state {
 						e.snd.cc = e.snd.initCongestionControl(e.cc)
 					}
 					e.mu.Unlock()
@@ -1472,7 +1584,7 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
 	defer e.mu.RUnlock()
 
 	// The endpoint cannot be in listen state.
-	if e.state == StateListen {
+	if e.EndpointState() == StateListen {
 		return 0, tcpip.ErrInvalidEndpointState
 	}
 
@@ -1731,7 +1843,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 		return err
 	}
 
-	if e.state.connected() {
+	if e.EndpointState().connected() {
 		// The endpoint is already connected. If caller hasn't been
 		// notified yet, return success.
 		if !e.isConnectNotified {
@@ -1743,7 +1855,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	}
 
 	nicID := addr.NIC
-	switch e.state {
+	switch e.EndpointState() {
 	case StateBound:
 		// If we're already bound to a NIC but the caller is requesting
 		// that we use a different one now, we cannot proceed.
@@ -1850,7 +1962,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	}
 
 	e.isRegistered = true
-	e.state = StateConnecting
+	e.setEndpointState(StateConnecting)
 	e.route = r.Clone()
 	e.boundNICID = nicID
 	e.effectiveNetProtos = netProtos
@@ -1871,14 +1983,13 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 		}
 		e.segmentQueue.mu.Unlock()
 		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
-		e.state = StateEstablished
-		e.stack.Stats().TCP.CurrentEstablished.Increment()
+		e.setEndpointState(StateEstablished)
 	}
 
 	if run {
 		e.workerRunning = true
 		e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
-		go e.protocolMainLoop(handshake) // S/R-SAFE: will be drained before save.
+		go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save.
 	}
 
 	return tcpip.ErrConnectStarted
@@ -1896,7 +2007,7 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 	e.shutdownFlags |= flags
 	finQueued := false
 	switch {
-	case e.state.connected():
+	case e.EndpointState().connected():
 		// Close for read.
 		if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
 			// Mark read side as closed.
@@ -1908,8 +2019,23 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 			// If we're fully closed and we have unread data we need to abort
 			// the connection with a RST.
 			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
-				e.notifyProtocolGoroutine(notifyReset)
+				// Move the socket to error state immediately.
+				// This is done redundantly because, in case of
+				// save/restore on a Shutdown/Close(), the socket
+				// state needs to indicate the error; otherwise the
+				// save file will show the socket in established
+				// state even though snd/rcv are closed.
 				e.mu.Unlock()
+				// Try to send an active reset immediately if the
+				// work mutex is available.
+				if e.workMu.TryLock() {
+					e.mu.Lock()
+					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+					e.mu.Unlock()
+					e.workMu.Unlock()
+				} else {
+					e.notifyProtocolGoroutine(notifyReset)
+				}
 				return nil
 			}
 		}
@@ -1931,11 +2057,10 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 			finQueued = true
 			// Mark endpoint as closed.
 			e.sndClosed = true
-
 			e.sndBufMu.Unlock()
 		}
 
-	case e.state == StateListen:
+	case e.EndpointState() == StateListen:
 		// Tell protocolListenLoop to stop.
 		if flags&tcpip.ShutdownRead != 0 {
 			e.notifyProtocolGoroutine(notifyClose)
@@ -1976,7 +2101,7 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	// When the endpoint shuts down, it sets workerCleanup to true, and from
 	// that point onward, acceptedChan is the responsibility of the cleanup()
 	// method (and should not be touched anywhere else, including here).
-	if e.state == StateListen && !e.workerCleanup {
+	if e.EndpointState() == StateListen && !e.workerCleanup {
 		// Adjust the size of the channel iff we can fix existing
 		// pending connections into the new one.
 		if len(e.acceptedChan) > backlog {
@@ -1994,7 +2119,7 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 		return nil
 	}
 
-	if e.state == StateInitial {
+	if e.EndpointState() == StateInitial {
 		// The listen is called on an unbound socket, the socket is
 		// automatically bound to a random free port with the local
 		// address set to INADDR_ANY.
@@ -2004,7 +2129,7 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	}
 
 	// Endpoint must be bound before it can transition to listen mode.
-	if e.state != StateBound {
+	if e.EndpointState() != StateBound {
 		e.stats.ReadErrors.InvalidEndpointState.Increment()
 		return tcpip.ErrInvalidEndpointState
 	}
@@ -2015,24 +2140,27 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	}
 
 	e.isRegistered = true
-	e.state = StateListen
+	e.setEndpointState(StateListen)
+
 	if e.acceptedChan == nil {
 		e.acceptedChan = make(chan *endpoint, backlog)
 	}
 	e.workerRunning = true
-
 	go e.protocolListenLoop( // S/R-SAFE: drained on save.
 		seqnum.Size(e.receiveBufferAvailable()))
-
 	return nil
 }
 
 // startAcceptedLoop sets up required state and starts a goroutine with the
 // main loop for accepted connections.
 func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
+	e.mu.Lock()
 	e.waiterQueue = waiterQueue
 	e.workerRunning = true
-	go e.protocolMainLoop(false) // S/R-SAFE: drained on save.
+	e.mu.Unlock()
+	wakerInitDone := make(chan struct{})
+	go e.protocolMainLoop(false, wakerInitDone) // S/R-SAFE: drained on save.
+	<-wakerInitDone
 }
 
 // Accept returns a new endpoint if a peer has established a connection
@@ -2042,7 +2170,7 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	defer e.mu.RUnlock()
 
 	// Endpoint must be in listen state before it can accept connections.
-	if e.state != StateListen {
+	if e.EndpointState() != StateListen {
 		return nil, nil, tcpip.ErrInvalidEndpointState
 	}
 
@@ -2069,7 +2197,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 	// Don't allow binding once endpoint is not in the initial state
 	// anymore. This is because once the endpoint goes into a connected or
 	// listen state, it is already bound.
-	if e.state != StateInitial {
+	if e.EndpointState() != StateInitial {
 		return tcpip.ErrAlreadyBound
 	}
 
@@ -2131,7 +2259,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 	}
 
 	// Mark endpoint as bound.
-	e.state = StateBound
+	e.setEndpointState(StateBound)
 
 	return nil
 }
@@ -2153,7 +2281,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 
-	if !e.state.connected() {
+	if !e.EndpointState().connected() {
 		return tcpip.FullAddress{}, tcpip.ErrNotConnected
 	}
 
@@ -2164,45 +2292,22 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	}, nil
 }
 
-// HandlePacket is called by the stack when new packets arrive to this transport
-// endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
-	s := newSegment(r, id, pkt)
-	if !s.parse() {
-		e.stack.Stats().MalformedRcvdPackets.Increment()
-		e.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
-		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
-		s.decRef()
-		return
-	}
-
-	if !s.csumValid {
-		e.stack.Stats().MalformedRcvdPackets.Increment()
-		e.stack.Stats().TCP.ChecksumErrors.Increment()
-		e.stats.ReceiveErrors.ChecksumErrors.Increment()
-		s.decRef()
-		return
-	}
-
-	e.stack.Stats().TCP.ValidSegmentsReceived.Increment()
-	e.stats.SegmentsReceived.Increment()
-	if (s.flags & header.TCPFlagRst) != 0 {
-		e.stack.Stats().TCP.ResetsReceived.Increment()
-	}
-
-	e.enqueueSegment(s)
+	// TCP HandlePacket is not required anymore as inbound packets first
+	// land at the Dispatcher, which can then either deliver them using the
+	// worker goroutine or directly invoke the TCP processing inline,
+	// based on the state of the endpoint.
 }
 
-func (e *endpoint) enqueueSegment(s *segment) {
+func (e *endpoint) enqueueSegment(s *segment) bool {
 	// Send packet to worker goroutine.
-	if e.segmentQueue.enqueue(s) {
-		e.newSegmentWaker.Assert()
-	} else {
+	if !e.segmentQueue.enqueue(s) {
 		// The queue is full, so we drop the segment.
 		e.stack.Stats().DroppedPackets.Increment()
 		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
-		s.decRef()
+		return false
 	}
+	return true
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
@@ -2319,8 +2424,8 @@ func (e *endpoint) rcvWndScaleForHandshake() int {
 // updateRecentTimestamp updates the recent timestamp using the algorithm
 // described in https://tools.ietf.org/html/rfc7323#section-4.3
 func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
-	if e.sendTSOk && seqnum.Value(e.recentTS).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
-		e.recentTS = tsVal
+	if e.sendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
+		e.setRecentTimestamp(tsVal)
 	}
 }
 
@@ -2330,7 +2435,7 @@ func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value,
 func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) {
 	if synOpts.TS {
 		e.sendTSOk = true
-		e.recentTS = synOpts.TSVal
+		e.setRecentTimestamp(synOpts.TSVal)
 	}
 }
 
@@ -2419,7 +2524,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
 
 	// Endpoint TCP Option state.
 	s.SendTSOk = e.sendTSOk
-	s.RecentTS = e.recentTS
+	s.RecentTS = e.recentTimestamp()
 	s.TSOffset = e.tsOffset
 	s.SACKPermitted = e.sackPermitted
 	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
@@ -2526,9 +2631,7 @@ func (e *endpoint) initGSO() {
 // State implements tcpip.Endpoint.State. It exports the endpoint's protocol
 // state for diagnostics.
 func (e *endpoint) State() uint32 {
-	e.mu.Lock()
-	defer e.mu.Unlock()
-	return uint32(e.state)
+	return uint32(e.EndpointState())
 }
 
 // Info returns a copy of the endpoint info.
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 4b8d867bc..4a46f0ec5 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -16,6 +16,7 @@ package tcp
 
 import (
 	"fmt"
+	"sync/atomic"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/sync"
@@ -48,7 +49,7 @@ func (e *endpoint) beforeSave() {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
-	switch e.state {
+	switch e.EndpointState() {
 	case StateInitial, StateBound:
 		// TODO(b/138137272): this enumeration duplicates
 		// EndpointState.connected. remove it.
@@ -70,31 +71,30 @@ func (e *endpoint) beforeSave() {
 		fallthrough
 	case StateListen, StateConnecting:
 		e.drainSegmentLocked()
-		if e.state != StateClose && e.state != StateError {
+		if e.EndpointState() != StateClose && e.EndpointState() != StateError {
 			if !e.workerRunning {
 				panic("endpoint has no worker running in listen, connecting, or connected state")
 			}
 			break
 		}
-		fallthrough
 	case StateError, StateClose:
-		for (e.state == StateError || e.state == StateClose) && e.workerRunning {
+		for e.workerRunning {
 			e.mu.Unlock()
 			time.Sleep(100 * time.Millisecond)
 			e.mu.Lock()
 		}
 		if e.workerRunning {
-			panic("endpoint still has worker running in closed or error state")
+			panic(fmt.Sprintf("endpoint: %+v still has worker running in closed or error state", e.ID))
 		}
 	default:
-		panic(fmt.Sprintf("endpoint in unknown state %v", e.state))
+		panic(fmt.Sprintf("endpoint in unknown state %v", e.EndpointState()))
 	}
 
 	if e.waiterQueue != nil && !e.waiterQueue.IsEmpty() {
 		panic("endpoint still has waiters upon save")
 	}
 
-	if e.state != StateClose && !((e.state == StateBound || e.state == StateListen) == e.isPortReserved) {
+	if e.EndpointState() != StateClose && !((e.EndpointState() == StateBound || e.EndpointState() == StateListen) == e.isPortReserved) {
 		panic("endpoints which are not in the closed state must have a reserved port IFF they are in bound or listen state")
 	}
 }
@@ -135,7 +135,7 @@ func (e *endpoint) loadAcceptedChan(acceptedEndpoints []*endpoint) {
 
 // saveState is invoked by stateify.
 func (e *endpoint) saveState() EndpointState {
-	return e.state
+	return e.EndpointState()
 }
 
 // Endpoint loading must be done in the following ordering by their state, to
@@ -151,7 +151,8 @@ var connectingLoading sync.WaitGroup
 func (e *endpoint) loadState(state EndpointState) {
 	// This is to ensure that the loading wait groups include all applicable
 	// endpoints before any asynchronous calls to the Wait() methods.
-	if state.connected() {
+	// For restore purposes we treat TimeWait like a connected endpoint.
+	if state.connected() || state == StateTimeWait {
 		connectedLoading.Add(1)
 	}
 	switch state {
@@ -160,13 +161,14 @@ func (e *endpoint) loadState(state EndpointState) {
 	case StateConnecting, StateSynSent, StateSynRecv:
 		connectingLoading.Add(1)
 	}
-	e.state = state
+	// Directly update the state here rather than using e.setEndpointState
+	// as the endpoint is still being loaded and the stack reference to increment
+	// metrics is not yet initialized.
+	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
 }
 
 // afterLoad is invoked by stateify.
 func (e *endpoint) afterLoad() {
-	// Freeze segment queue before registering to prevent any segments
-	// from being delivered while it is being restored.
 	e.origEndpointState = e.state
 	// Restore the endpoint to InitialState as it will be moved to
 	// its origEndpointState during Resume.
@@ -180,7 +182,6 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	e.segmentQueue.setLimit(MaxUnprocessedSegments)
 	e.workMu.Init()
 	state := e.origEndpointState
-
 	switch state {
 	case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
 		var ss SendBufferSizeOption
@@ -276,7 +277,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 				listenLoading.Wait()
 				connectingLoading.Wait()
 				bind()
-				e.state = StateClose
+				e.setEndpointState(StateClose)
 				tcpip.AsyncLoading.Done()
 			}()
 		}
@@ -288,6 +289,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		e.stack.CompleteTransportEndpointCleanup(e)
 		tcpip.DeleteDanglingEndpoint(e)
 	}
+
 }
 
 // saveLastError is invoked by stateify.
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 9a8f64aa6..958c06fa7 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -21,6 +21,7 @@
 package tcp
 
 import (
+	"runtime"
 	"strings"
 	"time"
 
@@ -104,6 +105,7 @@ type protocol struct {
 	moderateReceiveBuffer      bool
 	tcpLingerTimeout           time.Duration
 	tcpTimeWaitTimeout         time.Duration
+	dispatcher                 *dispatcher
 }
 
 // Number returns the tcp protocol number.
@@ -134,6 +136,14 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 	return h.SourcePort(), h.DestinationPort(), nil
 }
 
+// QueuePacket queues packets targeted at an endpoint after hashing the packet
+// to a specific processing queue. Each queue is serviced by its own processor
+// goroutine which is responsible for dequeuing and doing full TCP dispatch of
+// the packet.
+func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+	p.dispatcher.queuePacket(r, ep, id, pkt)
+}
+
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
 //
@@ -330,5 +340,6 @@ func NewProtocol() stack.TransportProtocol {
 		availableCongestionControl: []string{ccReno, ccCubic},
 		tcpLingerTimeout:           DefaultTCPLingerTimeout,
 		tcpTimeWaitTimeout:         DefaultTCPTimeWaitTimeout,
+		dispatcher:                 newDispatcher(runtime.GOMAXPROCS(0)),
 	}
 }
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 05c8488f8..958f03ac1 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -169,19 +169,19 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		// We just received a FIN, our next state depends on whether we sent a
 		// FIN already or not.
 		r.ep.mu.Lock()
-		switch r.ep.state {
+		switch r.ep.EndpointState() {
 		case StateEstablished:
-			r.ep.state = StateCloseWait
+			r.ep.setEndpointState(StateCloseWait)
 		case StateFinWait1:
 			if s.flagIsSet(header.TCPFlagAck) {
 				// FIN-ACK, transition to TIME-WAIT.
-				r.ep.state = StateTimeWait
+				r.ep.setEndpointState(StateTimeWait)
 			} else {
 				// Simultaneous close, expecting a final ACK.
-				r.ep.state = StateClosing
+				r.ep.setEndpointState(StateClosing)
 			}
 		case StateFinWait2:
-			r.ep.state = StateTimeWait
+			r.ep.setEndpointState(StateTimeWait)
 		}
 		r.ep.mu.Unlock()
 
@@ -205,16 +205,16 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 	// shutdown states.
 	if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == r.ep.snd.sndNxt {
 		r.ep.mu.Lock()
-		switch r.ep.state {
+		switch r.ep.EndpointState() {
 		case StateFinWait1:
-			r.ep.state = StateFinWait2
+			r.ep.setEndpointState(StateFinWait2)
 			// Notify protocol goroutine that we have received an
 			// ACK to our FIN so that it can start the FIN_WAIT2
 			// timer to abort connection if the other side does
 			// not close within 2MSL.
 			r.ep.notifyProtocolGoroutine(notifyClose)
 		case StateClosing:
-			r.ep.state = StateTimeWait
+			r.ep.setEndpointState(StateTimeWait)
 		case StateLastAck:
 			r.ep.transitionToStateCloseLocked()
 		}
@@ -267,7 +267,6 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 	switch state {
 	case StateCloseWait, StateClosing, StateLastAck:
 		if !s.sequenceNumber.LessThanEq(r.rcvNxt) {
-			s.decRef()
 			// Just drop the segment as we have
 			// already received a FIN and this
 			// segment is after the sequence number
@@ -284,7 +283,6 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 		// trigger a RST.
 		endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
 		if rcvClosed && r.rcvNxt.LessThan(endDataSeq) {
-			s.decRef()
 			return true, tcpip.ErrConnectionAborted
 		}
 		if state == StateFinWait1 {
@@ -314,7 +312,6 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 		// the last actual data octet in a segment in
 		// which it occurs.
 		if closed && (!s.flagIsSet(header.TCPFlagFin) || s.sequenceNumber.Add(s.logicalLen()) != r.rcvNxt+1) {
-			s.decRef()
 			return true, tcpip.ErrConnectionAborted
 		}
 	}
@@ -336,7 +333,7 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 // r as they arrive. It is called by the protocol main loop.
 func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
 	r.ep.mu.RLock()
-	state := r.ep.state
+	state := r.ep.EndpointState()
 	closed := r.ep.closed
 	r.ep.mu.RUnlock()
 
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index fdff7ed81..b74b61e7d 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -705,17 +705,15 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 		}
 		seg.flags = header.TCPFlagAck | header.TCPFlagFin
 		segEnd = seg.sequenceNumber.Add(1)
-		// Transition to FIN-WAIT1 state since we're initiating an active close.
-		s.ep.mu.Lock()
-		switch s.ep.state {
+		// Update the state to reflect that we have now
+		// queued a FIN.
+		switch s.ep.EndpointState() {
 		case StateCloseWait:
-			// We've already received a FIN and are now sending our own. The
-			// sender is now awaiting a final ACK for this FIN.
-			s.ep.state = StateLastAck
+			s.ep.setEndpointState(StateLastAck)
 		default:
-			s.ep.state = StateFinWait1
+			s.ep.setEndpointState(StateFinWait1)
 		}
-		s.ep.mu.Unlock()
+
 	} else {
 		// We're sending a non-FIN segment.
 		if seg.flags&header.TCPFlagFin != 0 {
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 6edfa8dce..a9dfbe857 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -293,7 +293,6 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 		checker.SeqNum(uint32(c.IRS+1)),
 		checker.AckNum(uint32(iss)+1),
 		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
-
 	finHeaders := &context.Headers{
 		SrcPort: context.TestPort,
 		DstPort: context.StackPort,
@@ -459,6 +458,9 @@ func TestConnectResetAfterClose(t *testing.T) {
 		checker.IPv4(t, b,
 			checker.TCP(
 				checker.DstPort(context.TestPort),
+				// RST is always generated with sndNxt which, if the FIN
+				// has been sent, will be 1 higher than the sequence number
+				// of the FIN itself.
 				checker.SeqNum(uint32(c.IRS)+2),
 				checker.AckNum(0),
 				checker.TCPFlags(header.TCPFlagRst),
@@ -1500,6 +1502,9 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
 		checker.TCP(
 			checker.DstPort(context.TestPort),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+			// RST is always generated with sndNxt which, if the FIN
+			// has been sent, will be 1 higher than the sequence
+			// number of the FIN itself.
 			checker.SeqNum(uint32(c.IRS)+2),
 		))
 	// The RST puts the endpoint into an error state.
@@ -5441,6 +5446,7 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 		rawEP.SendPacketWithTS(b[start:start+mss], tsVal)
 		packetsSent++
 	}
+
 	// Resume the worker so that it only sees the packets once all of them
 	// are waiting to be read.
 	worker.ResumeWork()
@@ -5508,7 +5514,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 	stk := c.Stack()
 	// Set lower limits for auto-tuning tests. This is required because the
 	// test stops the worker which can cause packets to be dropped because
-	// the segment queue holding unprocessed packets is limited to 500.
+	// the segment queue holding unprocessed packets is limited to 300.
 	const receiveBufferSize = 80 << 10 // 80KB.
 	const maxReceiveBufferSize = receiveBufferSize * 10
 	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{1, receiveBufferSize, maxReceiveBufferSize}); err != nil {
@@ -5563,6 +5569,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 			totalSent += mss
 			packetsSent++
 		}
+
 		// Resume it so that it only sees the packets once all of them
 		// are waiting to be read.
 		worker.ResumeWork()
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 5d114d460..2f9821555 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -533,7 +533,7 @@ TEST_P(SocketInetLoopbackTest, TCPFinWait2Test_NoRandomSave) {
 
   // Sleep for a little over the linger timeout to reduce flakiness in
   // save/restore tests.
-  absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+  absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 2));
 
   ds.reset();
 
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 6b99c021d..33a5ac66c 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -814,6 +814,20 @@ TEST_P(TcpSocketTest, FullBuffer) {
   t_ = -1;
 }
 
+TEST_P(TcpSocketTest, PollAfterShutdown) {
+  ScopedThread client_thread([this]() {
+    EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceedsWithValue(0));
+    struct pollfd poll_fd = {s_, POLLIN | POLLERR | POLLHUP, 0};
+    EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+                SyscallSucceedsWithValue(1));
+  });
+
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceedsWithValue(0));
+  struct pollfd poll_fd = {t_, POLLIN | POLLERR | POLLHUP, 0};
+  EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+              SyscallSucceedsWithValue(1));
+}
+
 TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListener) {
   // Initialize address to the loopback one.
   sockaddr_storage addr =
-- 
cgit v1.2.3


From c50efc8c700fa2628f1415daeeb3b382009eb1bb Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 16 Jan 2020 12:48:05 -0800
Subject: Disable xattr tests.

These can remain disabled until we actually support extended attributes.

The following modifications were also made:
1. Disable save/restore on tests that change file permissions. Restore will not
work properly for these tests, since it will try to open the file read-write
after it has been made read-only or write-only.
2. Change user.abc to user.test.

PiperOrigin-RevId: 290123941
---
 test/syscalls/BUILD          |   5 --
 test/syscalls/linux/xattr.cc | 152 ++++++++++++-------------------------------
 2 files changed, 42 insertions(+), 115 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index a3a85917d..829693e8e 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -717,11 +717,6 @@ syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test")
 
 syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
 
-syscall_test(
-    add_overlay = True,
-    test = "//test/syscalls/linux:xattr_test",
-)
-
 go_binary(
     name = "syscall_test_runner",
     testonly = 1,
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index 75740238c..b3bc3463e 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -59,7 +59,8 @@ TEST_F(XattrTest, XattrLargeName) {
   std::string name = "user.";
   name += std::string(XATTR_NAME_MAX - name.length(), 'a');
 
-  // TODO(b/127675828): Support setxattr and getxattr.
+  // An xattr should be whitelisted before it can be accessed--do not allow
+  // arbitrary xattrs to be read/written in gVisor.
   if (!IsRunningOnGvisor()) {
     EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
                 SyscallSucceeds());
@@ -83,59 +84,53 @@ TEST_F(XattrTest, XattrInvalidPrefix) {
               SyscallFailsWithErrno(EOPNOTSUPP));
 }
 
-TEST_F(XattrTest, XattrReadOnly) {
+// Do not allow save/restore cycles after making the test file read-only, as
+// the restore will fail to open it with r/w permissions.
+TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
 
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   size_t size = sizeof(val);
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
-                SyscallSucceeds());
-  }
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
 
+  DisableSave ds;
   ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IRUSR));
 
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
               SyscallFailsWithErrno(EACCES));
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    char buf = '-';
-    EXPECT_THAT(getxattr(path, name, &buf, size),
-                SyscallSucceedsWithValue(size));
-    EXPECT_EQ(buf, val);
-  }
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
 }
 
-TEST_F(XattrTest, XattrWriteOnly) {
+// Do not allow save/restore cycles after making the test file write-only, as
+// the restore will fail to open it with r/w permissions.
+TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
 
+  DisableSave ds;
   ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IWUSR));
 
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   size_t size = sizeof(val);
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
-                SyscallSucceeds());
-  }
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
 
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
 }
 
 TEST_F(XattrTest, XattrTrustedWithNonadmin) {
-  // TODO(b/127675828): Support setxattr and getxattr.
+  // TODO(b/127675828): Support setxattr and getxattr with "trusted" prefix.
   SKIP_IF(IsRunningOnGvisor());
   SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
 
@@ -147,11 +142,8 @@ TEST_F(XattrTest, XattrTrustedWithNonadmin) {
 }
 
 TEST_F(XattrTest, XattrOnDirectory) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
               SyscallSucceeds());
   EXPECT_THAT(getxattr(dir.path().c_str(), name, NULL, 0),
@@ -159,13 +151,10 @@ TEST_F(XattrTest, XattrOnDirectory) {
 }
 
 TEST_F(XattrTest, XattrOnSymlink) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(link.path().c_str(), name, NULL, 0, /*flags=*/0),
               SyscallSucceeds());
   EXPECT_THAT(getxattr(link.path().c_str(), name, NULL, 0),
@@ -173,7 +162,7 @@ TEST_F(XattrTest, XattrOnSymlink) {
 }
 
 TEST_F(XattrTest, XattrOnInvalidFileTypes) {
-  char name[] = "user.abc";
+  const char name[] = "user.test";
 
   char char_device[] = "/dev/zero";
   EXPECT_THAT(setxattr(char_device, name, NULL, 0, /*flags=*/0),
@@ -191,11 +180,8 @@ TEST_F(XattrTest, XattrOnInvalidFileTypes) {
 }
 
 TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
   size_t size = 1;
   EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
@@ -209,11 +195,8 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, SetxattrZeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds());
 
@@ -225,7 +208,7 @@ TEST_F(XattrTest, SetxattrZeroSize) {
 
 TEST_F(XattrTest, SetxattrSizeTooLarge) {
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
 
   // Note that each particular fs implementation may stipulate a lower size
   // limit, in which case we actually may fail (e.g. error with ENOSPC) for
@@ -235,43 +218,29 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
   EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
               SyscallFailsWithErrno(E2BIG));
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    EXPECT_THAT(getxattr(path, name, nullptr, 0),
-                SyscallFailsWithErrno(ENODATA));
-  }
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
               SyscallFailsWithErrno(EFAULT));
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    EXPECT_THAT(getxattr(path, name, nullptr, 0),
-                SyscallFailsWithErrno(ENODATA));
-  }
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
 
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
 }
 
 TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val(XATTR_SIZE_MAX + 1);
   std::fill(val.begin(), val.end(), 'a');
   size_t size = 1;
@@ -286,11 +255,8 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
   EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0),
               SyscallSucceeds());
@@ -304,11 +270,8 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithLarger) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
   EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0),
               SyscallSucceeds());
@@ -321,11 +284,8 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) {
 }
 
 TEST_F(XattrTest, SetxattrCreateFlag) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
               SyscallSucceeds());
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
@@ -335,11 +295,8 @@ TEST_F(XattrTest, SetxattrCreateFlag) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceFlag) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
               SyscallFailsWithErrno(ENODATA));
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -357,11 +314,8 @@ TEST_F(XattrTest, SetxattrInvalidFlags) {
 }
 
 TEST_F(XattrTest, Getxattr) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   int val = 1234;
   size_t size = sizeof(val);
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
@@ -372,11 +326,8 @@ TEST_F(XattrTest, Getxattr) {
 }
 
 TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
   size_t size = val.size();
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
@@ -387,11 +338,8 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   EXPECT_THAT(setxattr(path, name, &val, 1, /*flags=*/0), SyscallSucceeds());
 
@@ -405,11 +353,8 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrZeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
               SyscallSucceeds());
@@ -421,11 +366,8 @@ TEST_F(XattrTest, GetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrSizeTooLarge) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
               SyscallSucceeds());
@@ -440,11 +382,8 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, GetxattrNullValue) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   size_t size = sizeof(val);
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
@@ -454,11 +393,8 @@ TEST_F(XattrTest, GetxattrNullValue) {
 }
 
 TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   size_t size = sizeof(val);
   // Set value with zero size.
@@ -473,13 +409,9 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrNonexistentName) {
-  // TODO(b/127675828): Support getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  std::string name = "user.nonexistent";
-  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
-              SyscallFailsWithErrno(ENODATA));
+  const char name[] = "user.test";
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 }  // namespace
-- 
cgit v1.2.3


From 82ae857877fdf3492f40bca87657a07892c3f59b Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 6 Dec 2019 06:29:24 +0000
Subject: Enable build of test/syscall tests on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I277d6c708bbf5c3edd7c3568941cfd01dc122e17
---
 test/syscalls/linux/BUILD       | 55 ++++++++++++++++++++++++++++++++++-------
 test/syscalls/linux/bad.cc      |  3 ++-
 test/syscalls/linux/chroot.cc   |  2 +-
 test/syscalls/linux/fork.cc     |  3 +++
 test/syscalls/linux/getdents.cc | 10 +++++++-
 test/syscalls/linux/preadv2.cc  |  2 ++
 test/syscalls/linux/proc.cc     |  2 +-
 test/syscalls/linux/pwritev2.cc |  2 ++
 test/syscalls/linux/seccomp.cc  |  5 ++++
 test/syscalls/linux/stat.cc     |  2 ++
 test/util/signal_util.h         | 14 +++++++++++
 test/util/test_util.cc          |  2 +-
 12 files changed, 88 insertions(+), 14 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 064ce8429..68dcc598b 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -19,6 +19,16 @@ exports_files(
     visibility = ["//:sandbox"],
 )
 
+config_setting(
+    name = "x86_64",
+    constraint_values = ["@bazel_tools//platforms:x86_64"],
+)
+
+config_setting(
+    name = "aarch64",
+    constraint_values = ["@bazel_tools//platforms:aarch64"],
+)
+
 cc_binary(
     name = "sigaltstack_check",
     testonly = 1,
@@ -197,7 +207,10 @@ cc_binary(
 cc_binary(
     name = "32bit_test",
     testonly = 1,
-    srcs = ["32bit.cc"],
+    srcs = select({
+	":x86_64": ["32bit.cc"],
+	":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:memory_util",
@@ -584,7 +597,10 @@ cc_binary(
 cc_binary(
     name = "exceptions_test",
     testonly = 1,
-    srcs = ["exceptions.cc"],
+    srcs = select({
+        ":x86_64": ["exceptions.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:logging",
@@ -640,7 +656,10 @@ cc_binary(
 cc_binary(
     name = "exec_binary_test",
     testonly = 1,
-    srcs = ["exec_binary.cc"],
+    srcs = select({
+        ":x86_64": ["exec_binary.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
@@ -811,7 +830,10 @@ cc_binary(
 cc_binary(
     name = "fpsig_fork_test",
     testonly = 1,
-    srcs = ["fpsig_fork.cc"],
+    srcs = select({
+        ":x86_64": ["fpsig_fork.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:logging",
@@ -825,7 +847,10 @@ cc_binary(
 cc_binary(
     name = "fpsig_nested_test",
     testonly = 1,
-    srcs = ["fpsig_nested.cc"],
+    srcs = select({
+        ":x86_64": ["fpsig_nested.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:test_main",
@@ -1440,7 +1465,10 @@ cc_binary(
 cc_binary(
     name = "arch_prctl_test",
     testonly = 1,
-    srcs = ["arch_prctl.cc"],
+    srcs = select({
+        ":x86_64": ["arch_prctl.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:test_main",
@@ -2035,7 +2063,10 @@ cc_binary(
 cc_binary(
     name = "sigiret_test",
     testonly = 1,
-    srcs = ["sigiret.cc"],
+    srcs = select({
+        ":x86_64": ["sigiret.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:logging",
@@ -2043,7 +2074,10 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:timer_util",
         "@com_google_googletest//:gtest",
-    ],
+    ] + select({
+        ":x86_64": [],
+        ":aarch64": ["//test/util:test_main"],
+	}),
 )
 
 cc_binary(
@@ -3260,7 +3294,10 @@ cc_binary(
 cc_binary(
     name = "sysret_test",
     testonly = 1,
-    srcs = ["sysret.cc"],
+    srcs = select({
+        ":x86_64": ["sysret.cc"],
+        ":aarch64": [],
+    }),
     linkstatic = 1,
     deps = [
         "//test/util:logging",
diff --git a/test/syscalls/linux/bad.cc b/test/syscalls/linux/bad.cc
index f246a799e..9e4d8ea57 100644
--- a/test/syscalls/linux/bad.cc
+++ b/test/syscalls/linux/bad.cc
@@ -22,12 +22,13 @@ namespace gvisor {
 namespace testing {
 
 namespace {
-
+#if defined(__x86_64__)
 TEST(BadSyscallTest, NotImplemented) {
   // get_kernel_syms is not supported in Linux > 2.6, and not implemented in
   // gVisor.
   EXPECT_THAT(syscall(SYS_get_kernel_syms), SyscallFailsWithErrno(ENOSYS));
 }
+#endif // defined(__x86_64__)
 
 TEST(BadSyscallTest, NegativeOne) {
   EXPECT_THAT(syscall(-1), SyscallFailsWithErrno(ENOSYS));
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index 04bc2d7b9..0a2d44a2c 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -162,7 +162,7 @@ TEST(ChrootTest, DotDotFromOpenFD) {
 
   // getdents on fd should not error.
   char buf[1024];
-  ASSERT_THAT(syscall(SYS_getdents, fd.get(), buf, sizeof(buf)),
+  ASSERT_THAT(syscall(SYS_getdents64, fd.get(), buf, sizeof(buf)),
               SyscallSucceeds());
 }
 
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index 371890110..906f3358d 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -215,6 +215,8 @@ TEST_F(ForkTest, PrivateMapping) {
   EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
 }
 
+// CPUID is x86 specific.
+#ifdef __x86_64__
 // Test that cpuid works after a fork.
 TEST_F(ForkTest, Cpuid) {
   pid_t child = Fork();
@@ -227,6 +229,7 @@ TEST_F(ForkTest, Cpuid) {
   }
   EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
 }
+#endif
 
 TEST_F(ForkTest, Mmap) {
   pid_t child = Fork();
diff --git a/test/syscalls/linux/getdents.cc b/test/syscalls/linux/getdents.cc
index ad2dbacb8..bfd18d4ff 100644
--- a/test/syscalls/linux/getdents.cc
+++ b/test/syscalls/linux/getdents.cc
@@ -228,19 +228,27 @@ class GetdentsTest : public ::testing::Test {
 
 // Multiple template parameters are not allowed, so we must use explicit
 // template specialization to set the syscall number.
+#ifdef __x86_64__
 template <>
 int GetdentsTest<struct linux_dirent>::SyscallNum() {
   return SYS_getdents;
 }
+#endif
 
 template <>
 int GetdentsTest<struct linux_dirent64>::SyscallNum() {
   return SYS_getdents64;
 }
 
-// Test both legacy getdents and getdents64.
+#ifdef __x86_64__
+// Test both legacy getdents and getdents64 on x86_64.
 typedef ::testing::Types<struct linux_dirent, struct linux_dirent64>
     GetdentsTypes;
+#elif __aarch64__
+// Test only getdents64 on arm64.
+typedef ::testing::Types<struct linux_dirent64>
+    GetdentsTypes;
+#endif
 TYPED_TEST_SUITE(GetdentsTest, GetdentsTypes);
 
 // N.B. TYPED_TESTs require explicitly using this-> to access members of
diff --git a/test/syscalls/linux/preadv2.cc b/test/syscalls/linux/preadv2.cc
index c9246367d..3eeaf6ad8 100644
--- a/test/syscalls/linux/preadv2.cc
+++ b/test/syscalls/linux/preadv2.cc
@@ -35,6 +35,8 @@ namespace {
 #ifndef SYS_preadv2
 #if defined(__x86_64__)
 #define SYS_preadv2 327
+#elif defined(__aarch64__)
+#define SYS_preadv2 286
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 8cf08991b..5b4f29cd9 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1986,7 +1986,7 @@ TEST(Proc, GetdentsEnoent) {
       },
       nullptr, nullptr));
   char buf[1024];
-  ASSERT_THAT(syscall(SYS_getdents, fd.get(), buf, sizeof(buf)),
+  ASSERT_THAT(syscall(SYS_getdents64, fd.get(), buf, sizeof(buf)),
               SyscallFailsWithErrno(ENOENT));
 }
 
diff --git a/test/syscalls/linux/pwritev2.cc b/test/syscalls/linux/pwritev2.cc
index 1dbc0d6df..3fe5a600f 100644
--- a/test/syscalls/linux/pwritev2.cc
+++ b/test/syscalls/linux/pwritev2.cc
@@ -34,6 +34,8 @@ namespace {
 #ifndef SYS_pwritev2
 #if defined(__x86_64__)
 #define SYS_pwritev2 328
+#elif defined(__aarch64__)
+#define SYS_pwritev2 287
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 7e41fe7d8..6d7e543b9 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -49,7 +49,12 @@ namespace testing {
 namespace {
 
 // A syscall not implemented by Linux that we don't expect to be called.
+#ifdef __x86_64__
 constexpr uint32_t kFilteredSyscall = SYS_vserver;
+#elif __aarch64__
+// Using arch_specific_syscalls which are not implemented on arm64.
+constexpr uint32_t kFilteredSyscall = SYS_arch_specific_syscall+15;
+#endif
 
 // Applies a seccomp-bpf filter that returns `filtered_result` for
 // `sysno` and allows all other syscalls. Async-signal-safe.
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index 30de2f8ff..7a99f2636 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -557,6 +557,8 @@ TEST(SimpleStatTest, AnonDeviceAllocatesUniqueInodesAcrossSaveRestore) {
 #ifndef SYS_statx
 #if defined(__x86_64__)
 #define SYS_statx 332
+#elif defined(__aarch64__)
+#define SYS_statx 291
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/util/signal_util.h b/test/util/signal_util.h
index bcf85c337..e7b66aa51 100644
--- a/test/util/signal_util.h
+++ b/test/util/signal_util.h
@@ -85,6 +85,20 @@ inline void FixupFault(ucontext_t* ctx) {
   // The encoding is 0x48 0xab 0x00.
   ctx->uc_mcontext.gregs[REG_RIP] += 3;
 }
+#elif __aarch64__
+inline void Fault() {
+  // Zero x0, then store through it so the write faults at address 0.
+  asm("mov x0, xzr\r\n"
+      "str xzr, [x0]\r\n"
+      :
+      :
+      : "x0");
+}
+
+inline void FixupFault(ucontext_t* ctx) {
+  // Skip the bad instruction above.
+  ctx->uc_mcontext.pc += 4;
+}
 #endif
 
 }  // namespace testing
diff --git a/test/util/test_util.cc b/test/util/test_util.cc
index 848504c88..a4f78eec2 100644
--- a/test/util/test_util.cc
+++ b/test/util/test_util.cc
@@ -76,7 +76,6 @@ bool IsRunningWithHostinet() {
       "xchg %%rdi, %%rbx\n"                \
       : "=a"(a), "=D"(b), "=c"(c), "=d"(d) \
       : "a"(a_inp), "2"(c_inp))
-#endif  // defined(__x86_64__)
 
 CPUVendor GetCPUVendor() {
   uint32_t eax, ebx, ecx, edx;
@@ -93,6 +92,7 @@ CPUVendor GetCPUVendor() {
   }
   return CPUVendor::kUnknownVendor;
 }
+#endif  // defined(__x86_64__)
 
 bool operator==(const KernelVersion& first, const KernelVersion& second) {
   return first.major == second.major && first.minor == second.minor &&
-- 
cgit v1.2.3


From 9073521098ee52cdda74a193565b7bbe75d8c35a Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 17 Jan 2020 13:31:26 -0800
Subject: Convert EventMask to uint64

It is used for signalfd, where the maximum signal number is 64, so the previous
16-bit type is too narrow.

PiperOrigin-RevId: 290331008
---
 pkg/waiter/waiter.go            |   2 +-
 test/syscalls/BUILD             |   2 +
 test/syscalls/linux/signalfd.cc | 118 ++++++++++++++++++++++++----------------
 3 files changed, 74 insertions(+), 48 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go
index f708e95fa..707eb085b 100644
--- a/pkg/waiter/waiter.go
+++ b/pkg/waiter/waiter.go
@@ -62,7 +62,7 @@ import (
 )
 
 // EventMask represents io events as used in the poll() syscall.
-type EventMask uint16
+type EventMask uint64
 
 // Events that waiters can wait on. The meaning is the same as those in the
 // poll() syscall.
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 829693e8e..90d52e73b 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -380,6 +380,8 @@ syscall_test(test = "//test/syscalls/linux:rseq_test")
 
 syscall_test(test = "//test/syscalls/linux:rtsignal_test")
 
+syscall_test(test = "//test/syscalls/linux:signalfd_test")
+
 syscall_test(test = "//test/syscalls/linux:sched_test")
 
 syscall_test(test = "//test/syscalls/linux:sched_yield_test")
diff --git a/test/syscalls/linux/signalfd.cc b/test/syscalls/linux/signalfd.cc
index 09ecad34a..95be4b66c 100644
--- a/test/syscalls/linux/signalfd.cc
+++ b/test/syscalls/linux/signalfd.cc
@@ -39,6 +39,7 @@ namespace testing {
 namespace {
 
 constexpr int kSigno = SIGUSR1;
+constexpr int kSignoMax = 64;  // SIGRTMAX
 constexpr int kSignoAlt = SIGUSR2;
 
 // Returns a new signalfd.
@@ -51,41 +52,45 @@ inline PosixErrorOr<FileDescriptor> NewSignalFD(sigset_t* mask, int flags = 0) {
   return FileDescriptor(fd);
 }
 
-TEST(Signalfd, Basic) {
+class SignalfdTest : public ::testing::TestWithParam<int> {};
+
+TEST_P(SignalfdTest, Basic) {
+  int signo = GetParam();
   // Create the signalfd.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
 
   // Deliver the blocked signal.
   const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
-  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+  ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
 
   // We should now read the signal.
   struct signalfd_siginfo rbuf;
   ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallSucceedsWithValue(sizeof(rbuf)));
-  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf.ssi_signo, signo);
 }
 
-TEST(Signalfd, MaskWorks) {
+TEST_P(SignalfdTest, MaskWorks) {
+  int signo = GetParam();
   // Create two signalfds with different masks.
   sigset_t mask1, mask2;
   sigemptyset(&mask1);
   sigemptyset(&mask2);
-  sigaddset(&mask1, kSigno);
+  sigaddset(&mask1, signo);
   sigaddset(&mask2, kSignoAlt);
   FileDescriptor fd1 = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask1, 0));
   FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask2, 0));
 
   // Deliver the two signals.
   const auto scoped_sigmask1 =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
   const auto scoped_sigmask2 =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSignoAlt));
-  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+  ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
   ASSERT_THAT(tgkill(getpid(), gettid(), kSignoAlt), SyscallSucceeds());
 
   // We should see the signals on the appropriate signalfds.
@@ -98,7 +103,7 @@ TEST(Signalfd, MaskWorks) {
   EXPECT_EQ(rbuf2.ssi_signo, kSignoAlt);
   ASSERT_THAT(read(fd1.get(), &rbuf1, sizeof(rbuf1)),
               SyscallSucceedsWithValue(sizeof(rbuf1)));
-  EXPECT_EQ(rbuf1.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf1.ssi_signo, signo);
 }
 
 TEST(Signalfd, Cloexec) {
@@ -111,11 +116,12 @@ TEST(Signalfd, Cloexec) {
   EXPECT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
 }
 
-TEST(Signalfd, Blocking) {
+TEST_P(SignalfdTest, Blocking) {
+  int signo = GetParam();
   // Create the signalfd in blocking mode.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
 
   // Shared tid variable.
@@ -136,7 +142,7 @@ TEST(Signalfd, Blocking) {
     struct signalfd_siginfo rbuf;
     ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
                 SyscallSucceedsWithValue(sizeof(rbuf)));
-    EXPECT_EQ(rbuf.ssi_signo, kSigno);
+    EXPECT_EQ(rbuf.ssi_signo, signo);
   });
 
   // Wait until blocked.
@@ -149,20 +155,21 @@ TEST(Signalfd, Blocking) {
   //
   // See gvisor.dev/issue/139.
   if (IsRunningOnGvisor()) {
-    ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+    ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
   } else {
-    ASSERT_THAT(tgkill(getpid(), tid, kSigno), SyscallSucceeds());
+    ASSERT_THAT(tgkill(getpid(), tid, signo), SyscallSucceeds());
   }
 
   // Ensure that it was received.
   t.Join();
 }
 
-TEST(Signalfd, ThreadGroup) {
+TEST_P(SignalfdTest, ThreadGroup) {
+  int signo = GetParam();
   // Create the signalfd in blocking mode.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
 
   // Shared variable.
@@ -176,7 +183,7 @@ TEST(Signalfd, ThreadGroup) {
     struct signalfd_siginfo rbuf;
     ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
                 SyscallSucceedsWithValue(sizeof(rbuf)));
-    EXPECT_EQ(rbuf.ssi_signo, kSigno);
+    EXPECT_EQ(rbuf.ssi_signo, signo);
 
     // Wait for the other thread.
     absl::MutexLock ml(&mu);
@@ -185,7 +192,7 @@ TEST(Signalfd, ThreadGroup) {
   });
 
   // Deliver the signal to the threadgroup.
-  ASSERT_THAT(kill(getpid(), kSigno), SyscallSucceeds());
+  ASSERT_THAT(kill(getpid(), signo), SyscallSucceeds());
 
   // Wait for the first thread to process.
   {
@@ -194,13 +201,13 @@ TEST(Signalfd, ThreadGroup) {
   }
 
   // Deliver to the thread group again (other thread still exists).
-  ASSERT_THAT(kill(getpid(), kSigno), SyscallSucceeds());
+  ASSERT_THAT(kill(getpid(), signo), SyscallSucceeds());
 
   // Ensure that we can also receive it.
   struct signalfd_siginfo rbuf;
   ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallSucceedsWithValue(sizeof(rbuf)));
-  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf.ssi_signo, signo);
 
   // Mark the test as done.
   {
@@ -212,11 +219,12 @@ TEST(Signalfd, ThreadGroup) {
   t.Join();
 }
 
-TEST(Signalfd, Nonblock) {
+TEST_P(SignalfdTest, Nonblock) {
+  int signo = GetParam();
   // Create the signalfd in non-blocking mode.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_NONBLOCK));
 
@@ -227,20 +235,21 @@ TEST(Signalfd, Nonblock) {
 
   // Block and deliver the signal.
   const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
-  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+  ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
 
   // Ensure that a read actually works.
   ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallSucceedsWithValue(sizeof(rbuf)));
-  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf.ssi_signo, signo);
 
   // Should block again.
   EXPECT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallFailsWithErrno(EWOULDBLOCK));
 }
 
-TEST(Signalfd, SetMask) {
+TEST_P(SignalfdTest, SetMask) {
+  int signo = GetParam();
   // Create the signalfd matching nothing.
   sigset_t mask;
   sigemptyset(&mask);
@@ -249,8 +258,8 @@ TEST(Signalfd, SetMask) {
 
   // Block and deliver a signal.
   const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
-  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+  ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
 
   // We should have nothing.
   struct signalfd_siginfo rbuf;
@@ -258,29 +267,30 @@ TEST(Signalfd, SetMask) {
               SyscallFailsWithErrno(EWOULDBLOCK));
 
   // Change the signal mask.
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   ASSERT_THAT(signalfd(fd.get(), &mask, 0), SyscallSucceeds());
 
   // We should now have the signal.
   ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallSucceedsWithValue(sizeof(rbuf)));
-  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf.ssi_signo, signo);
 }
 
-TEST(Signalfd, Poll) {
+TEST_P(SignalfdTest, Poll) {
+  int signo = GetParam();
   // Create the signalfd.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
 
   // Block the signal, and start a thread to deliver it.
   const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
   pid_t orig_tid = gettid();
   ScopedThread t([&] {
     absl::SleepFor(absl::Seconds(5));
-    ASSERT_THAT(tgkill(getpid(), orig_tid, kSigno), SyscallSucceeds());
+    ASSERT_THAT(tgkill(getpid(), orig_tid, signo), SyscallSucceeds());
   });
 
   // Start polling for the signal. We expect that it is not available at the
@@ -297,19 +307,18 @@ TEST(Signalfd, Poll) {
               SyscallSucceedsWithValue(sizeof(rbuf)));
 }
 
-TEST(Signalfd, KillStillKills) {
-  sigset_t mask;
-  sigemptyset(&mask);
-  sigaddset(&mask, SIGKILL);
-  FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_CLOEXEC));
-
-  // Just because there is a signalfd, we shouldn't see any change in behavior
-  // for unblockable signals. It's easier to test this with SIGKILL.
-  const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, SIGKILL));
-  EXPECT_EXIT(tgkill(getpid(), gettid(), SIGKILL), KilledBySignal(SIGKILL), "");
+std::string PrintSigno(::testing::TestParamInfo<int> info) {
+  switch (info.param) {
+    case kSigno:
+      return "kSigno";
+    case kSignoMax:
+      return "kSignoMax";
+    default:
+      return absl::StrCat(info.param);
+  }
 }
+INSTANTIATE_TEST_SUITE_P(Signalfd, SignalfdTest,
+                         ::testing::Values(kSigno, kSignoMax), PrintSigno);
 
 TEST(Signalfd, Ppoll) {
   sigset_t mask;
@@ -328,6 +337,20 @@ TEST(Signalfd, Ppoll) {
               SyscallSucceedsWithValue(0));
 }
 
+TEST(Signalfd, KillStillKills) {
+  sigset_t mask;
+  sigemptyset(&mask);
+  sigaddset(&mask, SIGKILL);
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_CLOEXEC));
+
+  // Just because there is a signalfd, we shouldn't see any change in behavior
+  // for unblockable signals. It's easier to test this with SIGKILL.
+  const auto scoped_sigmask =
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, SIGKILL));
+  EXPECT_EXIT(tgkill(getpid(), gettid(), SIGKILL), KilledBySignal(SIGKILL), "");
+}
+
 }  // namespace
 
 }  // namespace testing
@@ -340,6 +363,7 @@ int main(int argc, char** argv) {
   sigset_t set;
   sigemptyset(&set);
   sigaddset(&set, gvisor::testing::kSigno);
+  sigaddset(&set, gvisor::testing::kSignoMax);
   sigaddset(&set, gvisor::testing::kSignoAlt);
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
-- 
cgit v1.2.3


From 2ba6198851dc1e293295d7cadf8c0ae456b68beb Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 21 Jan 2020 12:41:50 -0800
Subject: Add syscalls for lgetxattr, fgetxattr, lsetxattr, and fsetxattr.

Note that these will simply use the same logic as getxattr and setxattr, which
is not yet implemented for most filesystems.

PiperOrigin-RevId: 290800960
---
 pkg/sentry/syscalls/linux/linux64_amd64.go |   8 +-
 pkg/sentry/syscalls/linux/linux64_arm64.go |   8 +-
 pkg/sentry/syscalls/linux/sys_xattr.go     | 136 +++++++++++++++++++++--------
 test/syscalls/linux/xattr.cc               |  41 +++++++++
 4 files changed, 150 insertions(+), 43 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 6b2920900..c76771a54 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -229,11 +229,11 @@ var AMD64 = &kernel.SyscallTable{
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
 		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
 		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
 		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index c9629f6f3..d3587fda6 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -42,11 +42,11 @@ var ARM64 = &kernel.SyscallTable{
 		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		6:   syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		7:   syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
 		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
 		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		12:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 23d20da6f..e35c077d6 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -27,6 +27,40 @@ import (
 
 // GetXattr implements linux syscall getxattr(2).
 func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getXattrFromPath(t, args, true)
+}
+
+// LGetXattr implements linux syscall lgetxattr(2).
+func LGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getXattrFromPath(t, args, false)
+}
+
+// FGetXattr implements linux syscall fgetxattr(2).
+func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := uint64(args[3].SizeT())
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	n, value, err := getXattr(t, f.Dirent, nameAddr, size)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if _, err := t.CopyOutBytes(valueAddr, []byte(value)); err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
 	pathAddr := args[0].Pointer()
 	nameAddr := args[1].Pointer()
 	valueAddr := args[2].Pointer()
@@ -38,29 +72,17 @@ func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 
 	valueLen := 0
-	err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
-		// If getxattr(2) is called with size 0, the size of the value will be
-		// returned successfully even if it is nonzero. In that case, we need to
-		// retrieve the entire attribute value so we can return the correct size.
-		requestedSize := size
-		if size == 0 || size > linux.XATTR_SIZE_MAX {
-			requestedSize = linux.XATTR_SIZE_MAX
+	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
 		}
 
-		value, err := getXattr(t, d, dirPath, nameAddr, uint64(requestedSize))
+		n, value, err := getXattr(t, d, nameAddr, size)
+		valueLen = n
 		if err != nil {
 			return err
 		}
 
-		valueLen = len(value)
-		if uint64(valueLen) > requestedSize {
-			return syserror.ERANGE
-		}
-
-		// Skip copying out the attribute value if size is 0.
-		if size == 0 {
-			return nil
-		}
 		_, err = t.CopyOutBytes(valueAddr, []byte(value))
 		return err
 	})
@@ -71,29 +93,73 @@ func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 }
 
 // getXattr implements getxattr(2) from the given *fs.Dirent.
-func getXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr, size uint64) (string, error) {
-	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
-		return "", syserror.ENOTDIR
-	}
-
+func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr, size uint64) (int, string, error) {
 	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
-		return "", err
+		return 0, "", err
 	}
 
 	name, err := copyInXattrName(t, nameAddr)
 	if err != nil {
-		return "", err
+		return 0, "", err
 	}
 
 	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
-		return "", syserror.EOPNOTSUPP
+		return 0, "", syserror.EOPNOTSUPP
 	}
 
-	return d.Inode.GetXattr(t, name, size)
+	// If getxattr(2) is called with size 0, the size of the value will be
+	// returned successfully even if it is nonzero. In that case, we need to
+	// retrieve the entire attribute value so we can return the correct size.
+	requestedSize := size
+	if size == 0 || size > linux.XATTR_SIZE_MAX {
+		requestedSize = linux.XATTR_SIZE_MAX
+	}
+
+	value, err := d.Inode.GetXattr(t, name, requestedSize)
+	if err != nil {
+		return 0, "", err
+	}
+	n := len(value)
+	if uint64(n) > requestedSize {
+		return 0, "", syserror.ERANGE
+	}
+
+	// Don't copy out the attribute value if size is 0.
+	if size == 0 {
+		return n, "", nil
+	}
+	return n, value, nil
 }
 
 // SetXattr implements linux syscall setxattr(2).
 func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return setXattrFromPath(t, args, true)
+}
+
+// LSetXattr implements linux syscall lsetxattr(2).
+func LSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return setXattrFromPath(t, args, false)
+}
+
+// FSetXattr implements linux syscall fsetxattr(2).
+func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := uint64(args[3].SizeT())
+	flags := args[4].Uint()
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	return 0, nil, setXattr(t, f.Dirent, nameAddr, valueAddr, uint64(size), flags)
+}
+
+func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
 	pathAddr := args[0].Pointer()
 	nameAddr := args[1].Pointer()
 	valueAddr := args[2].Pointer()
@@ -105,19 +171,19 @@ func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		return 0, nil, err
 	}
 
-	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
-		return 0, nil, syserror.EINVAL
-	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
 
-	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
-		return setXattr(t, d, dirPath, nameAddr, valueAddr, uint64(size), flags)
+		return setXattr(t, d, nameAddr, valueAddr, uint64(size), flags)
 	})
 }
 
 // setXattr implements setxattr(2) from the given *fs.Dirent.
-func setXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error {
-	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
-		return syserror.ENOTDIR
+func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error {
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return syserror.EINVAL
 	}
 
 	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
@@ -133,7 +199,7 @@ func setXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr us
 		return syserror.E2BIG
 	}
 	buf := make([]byte, size)
-	if _, err = t.CopyInBytes(valueAddr, buf); err != nil {
+	if _, err := t.CopyInBytes(valueAddr, buf); err != nil {
 		return err
 	}
 	value := string(buf)
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index b3bc3463e..e77c355d7 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -26,6 +26,7 @@
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
@@ -414,6 +415,46 @@ TEST_F(XattrTest, GetxattrNonexistentName) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
+TEST_F(XattrTest, LGetSetxattrOnSymlink) {
+  TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
+
+  EXPECT_THAT(lsetxattr(link.path().c_str(), nullptr, nullptr, 0, 0),
+              SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(lgetxattr(link.path().c_str(), nullptr, nullptr, 0),
+              SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  int val = 1234;
+  size_t size = sizeof(val);
+  EXPECT_THAT(lsetxattr(path, name, &val, size, /*flags=*/0),
+              SyscallSucceeds());
+
+  int buf = 0;
+  EXPECT_THAT(lgetxattr(path, name, &buf, size),
+              SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, FGetSetxattr) {
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0));
+  const char name[] = "user.test";
+  int val = 1234;
+  size_t size = sizeof(val);
+  EXPECT_THAT(fsetxattr(fd.get(), name, &val, size, /*flags=*/0),
+              SyscallSucceeds());
+
+  int buf = 0;
+  EXPECT_THAT(fgetxattr(fd.get(), name, &buf, size),
+              SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From cbc0a92276b75e744511a43a9c0b78fc64946ec6 Mon Sep 17 00:00:00 2001
From: Ryan Heacock <rheacock@google.com>
Date: Tue, 21 Jan 2020 14:15:01 -0800
Subject: Correct todos referencing IPV6_RECVTCLASS

Bug 68320120 was revived because TODOs referenced the IP_RECVTOS bug instead
of the IPV6_RECVTCLASS bug.

PiperOrigin-RevId: 290820178
---
 test/syscalls/linux/udp_socket_test_cases.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 68e0a8109..a2f6ef8cc 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,7 +1349,7 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
   SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
           !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
@@ -1422,7 +1422,7 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
   // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From 2296b4734462b6eeef383ea58e2b1b0b1a214d76 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 21 Jan 2020 16:16:51 -0800
Subject: Change to standard types.

PiperOrigin-RevId: 290846481
---
 test/syscalls/linux/aio.cc                         |  2 +-
 test/syscalls/linux/chown.cc                       |  6 +-
 test/syscalls/linux/chroot.cc                      |  2 +-
 test/syscalls/linux/clock_gettime.cc               | 12 ++--
 test/syscalls/linux/eventfd.cc                     | 22 ++++----
 test/syscalls/linux/exceptions.cc                  | 66 +++++++++++-----------
 test/syscalls/linux/exec.cc                        | 10 ++--
 test/syscalls/linux/exec_binary.cc                 | 18 +++---
 test/syscalls/linux/fcntl.cc                       | 22 ++++----
 test/syscalls/linux/fork.cc                        |  2 +-
 test/syscalls/linux/futex.cc                       |  2 +-
 test/syscalls/linux/inotify.cc                     | 24 ++++----
 test/syscalls/linux/ip_socket_test_util.cc         |  4 +-
 test/syscalls/linux/ip_socket_test_util.h          |  4 +-
 test/syscalls/linux/itimer.cc                      |  4 +-
 test/syscalls/linux/kill.cc                        |  4 +-
 test/syscalls/linux/link.cc                        |  5 +-
 test/syscalls/linux/memfd.cc                       |  2 +-
 test/syscalls/linux/memory_accounting.cc           | 14 ++---
 test/syscalls/linux/mempolicy.cc                   | 28 ++++-----
 test/syscalls/linux/mmap.cc                        | 16 +++---
 test/syscalls/linux/open.cc                        |  2 +-
 test/syscalls/linux/partial_bad_buffer.cc          |  2 +-
 test/syscalls/linux/prctl_setuid.cc                |  2 +-
 test/syscalls/linux/proc.cc                        | 42 +++++++-------
 test/syscalls/linux/proc_net_tcp.cc                | 62 ++++++++++----------
 test/syscalls/linux/proc_net_udp.cc                | 32 +++++------
 test/syscalls/linux/proc_net_unix.cc               | 12 ++--
 test/syscalls/linux/proc_pid_uid_gid_map.cc        | 26 ++++-----
 test/syscalls/linux/ptrace.cc                      |  4 +-
 test/syscalls/linux/pty.cc                         | 14 ++---
 test/syscalls/linux/pwrite64.cc                    |  4 +-
 test/syscalls/linux/raw_socket_hdrincl.cc          |  4 +-
 test/syscalls/linux/rseq.cc                        |  2 +-
 test/syscalls/linux/rseq/critical.h                |  2 +-
 test/syscalls/linux/rseq/rseq.cc                   | 50 ++++++++--------
 test/syscalls/linux/rseq/types.h                   | 16 +++---
 test/syscalls/linux/seccomp.cc                     | 14 ++---
 test/syscalls/linux/semaphore.cc                   | 10 ++--
 test/syscalls/linux/shm.cc                         | 10 ++--
 test/syscalls/linux/sigaltstack.cc                 |  2 +-
 test/syscalls/linux/sigiret.cc                     | 14 ++---
 .../linux/socket_bind_to_device_distribution.cc    | 14 ++---
 test/syscalls/linux/socket_generic.cc              |  2 +-
 test/syscalls/linux/socket_inet_loopback.cc        | 56 +++++++++---------
 test/syscalls/linux/socket_ip_unbound.cc           |  8 +--
 test/syscalls/linux/socket_netdevice.cc            |  8 +--
 test/syscalls/linux/socket_netlink_route.cc        | 30 +++++-----
 test/syscalls/linux/socket_netlink_util.cc         |  4 +-
 test/syscalls/linux/socket_netlink_util.h          |  2 +-
 test/syscalls/linux/socket_test_util.cc            |  2 +-
 test/syscalls/linux/splice.cc                      |  2 +-
 test/syscalls/linux/stat.cc                        | 40 ++++++-------
 test/syscalls/linux/sticky.cc                      |  4 +-
 test/syscalls/linux/sysret.cc                      |  8 +--
 test/syscalls/linux/tcp_socket.cc                  |  2 +-
 test/syscalls/linux/time.cc                        |  4 +-
 test/syscalls/linux/timerfd.cc                     | 48 ++++++++--------
 test/syscalls/linux/udp_socket_test_cases.cc       | 14 ++---
 test/syscalls/linux/uidgid.cc                      |  8 +--
 test/syscalls/linux/utimes.cc                      | 25 ++++----
 test/syscalls/linux/vfork.cc                       | 14 ++---
 test/syscalls/linux/vsyscall.cc                    |  2 +-
 test/syscalls/linux/wait.cc                        | 18 +++---
 test/util/mount_util.h                             |  6 +-
 test/util/multiprocess_util.cc                     |  2 +-
 test/util/multiprocess_util.h                      |  5 +-
 test/util/proc_util.cc                             |  2 +-
 test/util/temp_path.cc                             |  2 +-
 test/util/test_util.cc                             | 20 +++----
 test/util/test_util.h                              | 12 ++--
 test/util/test_util_test.cc                        |  4 +-
 72 files changed, 483 insertions(+), 480 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc
index 28592bc8f..a33daff17 100644
--- a/test/syscalls/linux/aio.cc
+++ b/test/syscalls/linux/aio.cc
@@ -183,7 +183,7 @@ TEST_F(AIOTest, BadWrite) {
 
   // Verify that it fails with the right error code.
   EXPECT_EQ(events[0].data, 0x123);
-  EXPECT_EQ(events[0].obj, reinterpret_cast<uint64>(&cb));
+  EXPECT_EQ(events[0].obj, reinterpret_cast<uint64_t>(&cb));
   EXPECT_LT(events[0].res, 0);
 }
 
diff --git a/test/syscalls/linux/chown.cc b/test/syscalls/linux/chown.cc
index 1c00e2731..7a28b674d 100644
--- a/test/syscalls/linux/chown.cc
+++ b/test/syscalls/linux/chown.cc
@@ -31,9 +31,9 @@
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
-ABSL_FLAG(int32, scratch_uid1, 65534, "first scratch UID");
-ABSL_FLAG(int32, scratch_uid2, 65533, "second scratch UID");
-ABSL_FLAG(int32, scratch_gid, 65534, "first scratch GID");
+ABSL_FLAG(int32_t, scratch_uid1, 65534, "first scratch UID");
+ABSL_FLAG(int32_t, scratch_uid2, 65533, "second scratch UID");
+ABSL_FLAG(int32_t, scratch_gid, 65534, "first scratch GID");
 
 namespace gvisor {
 namespace testing {
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index 27e057086..04bc2d7b9 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -253,7 +253,7 @@ TEST(ChrootTest, ProcMemSelfMapsNoEscapeProcOpen) {
   // Mmap the newly created file.
   void* foo_map = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
                        foo.get(), 0);
-  ASSERT_THAT(reinterpret_cast<int64>(foo_map), SyscallSucceeds());
+  ASSERT_THAT(reinterpret_cast<int64_t>(foo_map), SyscallSucceeds());
 
   // Always unmap.
   auto cleanup_map = Cleanup(
diff --git a/test/syscalls/linux/clock_gettime.cc b/test/syscalls/linux/clock_gettime.cc
index 1d5b5af94..7f6015049 100644
--- a/test/syscalls/linux/clock_gettime.cc
+++ b/test/syscalls/linux/clock_gettime.cc
@@ -34,7 +34,7 @@ namespace testing {
 
 namespace {
 
-int64 clock_gettime_nsecs(clockid_t id) {
+int64_t clock_gettime_nsecs(clockid_t id) {
   struct timespec ts;
   TEST_PCHECK(clock_gettime(id, &ts) == 0);
   return (ts.tv_sec * 1000000000 + ts.tv_nsec);
@@ -42,9 +42,9 @@ int64 clock_gettime_nsecs(clockid_t id) {
 
 // Spin on the CPU for at least ns nanoseconds, based on
 // CLOCK_THREAD_CPUTIME_ID.
-void spin_ns(int64 ns) {
-  int64 start = clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID);
-  int64 end = start + ns;
+void spin_ns(int64_t ns) {
+  int64_t start = clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID);
+  int64_t end = start + ns;
 
   do {
     constexpr int kLoopCount = 1000000;  // large and arbitrary
@@ -64,7 +64,7 @@ TEST(ClockGettime, CputimeId) {
   // the workers. Note that we test CLOCK_PROCESS_CPUTIME_ID by having the
   // workers execute in parallel and verifying that CLOCK_PROCESS_CPUTIME_ID
   // accumulates the runtime of all threads.
-  int64 start = clock_gettime_nsecs(CLOCK_PROCESS_CPUTIME_ID);
+  int64_t start = clock_gettime_nsecs(CLOCK_PROCESS_CPUTIME_ID);
 
   // Create a kNumThreads threads.
   std::list<ScopedThread> threads;
@@ -76,7 +76,7 @@ TEST(ClockGettime, CputimeId) {
     t.Join();
   }
 
-  int64 end = clock_gettime_nsecs(CLOCK_PROCESS_CPUTIME_ID);
+  int64_t end = clock_gettime_nsecs(CLOCK_PROCESS_CPUTIME_ID);
 
   // The aggregate time spent in the worker threads must be at least
   // 'kNumThreads' times the time each thread spun.
diff --git a/test/syscalls/linux/eventfd.cc b/test/syscalls/linux/eventfd.cc
index fed67a56e..367682c3d 100644
--- a/test/syscalls/linux/eventfd.cc
+++ b/test/syscalls/linux/eventfd.cc
@@ -37,7 +37,7 @@ TEST(EventfdTest, Nonblock) {
   FileDescriptor efd =
       ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
 
-  uint64 l;
+  uint64_t l;
   ASSERT_THAT(read(efd.get(), &l, sizeof(l)), SyscallFailsWithErrno(EAGAIN));
 
   l = 1;
@@ -52,7 +52,7 @@ TEST(EventfdTest, Nonblock) {
 
 void* read_three_times(void* arg) {
   int efd = *reinterpret_cast<int*>(arg);
-  uint64 l;
+  uint64_t l;
   EXPECT_THAT(read(efd, &l, sizeof(l)), SyscallSucceedsWithValue(sizeof(l)));
   EXPECT_THAT(read(efd, &l, sizeof(l)), SyscallSucceedsWithValue(sizeof(l)));
   EXPECT_THAT(read(efd, &l, sizeof(l)), SyscallSucceedsWithValue(sizeof(l)));
@@ -68,7 +68,7 @@ TEST(EventfdTest, BlockingWrite) {
                              reinterpret_cast<void*>(&efd)),
               SyscallSucceeds());
 
-  uint64 l = 1;
+  uint64_t l = 1;
   ASSERT_THAT(write(efd, &l, sizeof(l)), SyscallSucceeds());
   EXPECT_EQ(l, 1);
 
@@ -85,7 +85,7 @@ TEST(EventfdTest, SmallWrite) {
   FileDescriptor efd =
       ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
 
-  uint64 l = 16;
+  uint64_t l = 16;
   ASSERT_THAT(write(efd.get(), &l, 4), SyscallFailsWithErrno(EINVAL));
 }
 
@@ -93,7 +93,7 @@ TEST(EventfdTest, SmallRead) {
   FileDescriptor efd =
       ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
 
-  uint64 l = 1;
+  uint64_t l = 1;
   ASSERT_THAT(write(efd.get(), &l, sizeof(l)), SyscallSucceeds());
 
   l = 0;
@@ -104,7 +104,7 @@ TEST(EventfdTest, BigWrite) {
   FileDescriptor efd =
       ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
 
-  uint64 big[16];
+  uint64_t big[16];
   big[0] = 16;
   ASSERT_THAT(write(efd.get(), big, sizeof(big)), SyscallSucceeds());
 }
@@ -113,10 +113,10 @@ TEST(EventfdTest, BigRead) {
   FileDescriptor efd =
       ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
 
-  uint64 l = 1;
+  uint64_t l = 1;
   ASSERT_THAT(write(efd.get(), &l, sizeof(l)), SyscallSucceeds());
 
-  uint64 big[16];
+  uint64_t big[16];
   ASSERT_THAT(read(efd.get(), big, sizeof(big)), SyscallSucceeds());
   EXPECT_EQ(big[0], 1);
 }
@@ -125,7 +125,7 @@ TEST(EventfdTest, BigWriteBigRead) {
   FileDescriptor efd =
       ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
 
-  uint64 l[16];
+  uint64_t l[16];
   l[0] = 16;
   ASSERT_THAT(write(efd.get(), l, sizeof(l)), SyscallSucceeds());
   ASSERT_THAT(read(efd.get(), l, sizeof(l)), SyscallSucceeds());
@@ -150,7 +150,7 @@ TEST(EventfdTest, NotifyNonZero_NoRandomSave) {
   int wait_out = epoll_wait(epollfd.get(), &out_ev, 1, kEpollTimeoutMs);
   EXPECT_EQ(wait_out, 1);
   EXPECT_EQ(efd.get(), out_ev.data.fd);
-  uint64 val = 0;
+  uint64_t val = 0;
   ASSERT_THAT(read(efd.get(), &val, sizeof(val)), SyscallSucceeds());
   EXPECT_EQ(val, 1);
 
@@ -159,7 +159,7 @@ TEST(EventfdTest, NotifyNonZero_NoRandomSave) {
   // epoll_wait times out.
   ScopedThread t([&efd] {
     sleep(5);
-    uint64 val = 1;
+    uint64_t val = 1;
     EXPECT_THAT(write(efd.get(), &val, sizeof(val)),
                 SyscallSucceedsWithValue(sizeof(val)));
   });
diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc
index 0b67eb0ad..3d564e720 100644
--- a/test/syscalls/linux/exceptions.cc
+++ b/test/syscalls/linux/exceptions.cc
@@ -24,20 +24,20 @@ namespace testing {
 
 // Default value for the x87 FPU control word. See Intel SDM Vol 1, Ch 8.1.5
 // "x87 FPU Control Word".
-constexpr uint16 kX87ControlWordDefault = 0x37f;
+constexpr uint16_t kX87ControlWordDefault = 0x37f;
 
 // Mask for the divide-by-zero exception.
-constexpr uint16 kX87ControlWordDiv0Mask = 1 << 2;
+constexpr uint16_t kX87ControlWordDiv0Mask = 1 << 2;
 
 // Default value for the SSE control register (MXCSR). See Intel SDM Vol 1, Ch
 // 11.6.4 "Initialization of SSE/SSE3 Extensions".
-constexpr uint32 kMXCSRDefault = 0x1f80;
+constexpr uint32_t kMXCSRDefault = 0x1f80;
 
 // Mask for the divide-by-zero exception.
-constexpr uint32 kMXCSRDiv0Mask = 1 << 9;
+constexpr uint32_t kMXCSRDiv0Mask = 1 << 9;
 
 // Flag for a pending divide-by-zero exception.
-constexpr uint32 kMXCSRDiv0Flag = 1 << 2;
+constexpr uint32_t kMXCSRDiv0Flag = 1 << 2;
 
 void inline Halt() { asm("hlt\r\n"); }
 
@@ -112,10 +112,10 @@ TEST(ExceptionTest, DivideByZero) {
 
   EXPECT_EXIT(
       {
-        uint32 remainder;
-        uint32 quotient;
-        uint32 divisor = 0;
-        uint64 value = 1;
+        uint32_t remainder;
+        uint32_t quotient;
+        uint32_t divisor = 0;
+        uint64_t value = 1;
         asm("divl 0(%2)\r\n"
             : "=d"(remainder), "=a"(quotient)
             : "r"(&divisor), "d"(value >> 32), "a"(value));
@@ -126,9 +126,9 @@ TEST(ExceptionTest, DivideByZero) {
 
 // By default, x87 exceptions are masked and simply return a default value.
 TEST(ExceptionTest, X87DivideByZeroMasked) {
-  int32 quotient;
-  int32 value = 1;
-  int32 divisor = 0;
+  int32_t quotient;
+  int32_t value = 1;
+  int32_t divisor = 0;
   asm("fildl %[value]\r\n"
       "fidivl %[divisor]\r\n"
       "fistpl %[quotient]\r\n"
@@ -148,12 +148,12 @@ TEST(ExceptionTest, X87DivideByZeroUnmasked) {
   EXPECT_EXIT(
       {
         // Clear the divide by zero exception mask.
-        constexpr uint16 kControlWord =
+        constexpr uint16_t kControlWord =
             kX87ControlWordDefault & ~kX87ControlWordDiv0Mask;
 
-        int32 quotient;
-        int32 value = 1;
-        int32 divisor = 0;
+        int32_t quotient;
+        int32_t value = 1;
+        int32_t divisor = 0;
         asm volatile(
             "fldcw %[cw]\r\n"
             "fildl %[value]\r\n"
@@ -176,12 +176,12 @@ TEST(ExceptionTest, X87StatusClobber) {
   EXPECT_EXIT(
       {
         // Clear the divide by zero exception mask.
-        constexpr uint16 kControlWord =
+        constexpr uint16_t kControlWord =
             kX87ControlWordDefault & ~kX87ControlWordDiv0Mask;
 
-        int32 quotient;
-        int32 value = 1;
-        int32 divisor = 0;
+        int32_t quotient;
+        int32_t value = 1;
+        int32_t divisor = 0;
         asm volatile(
             "fildl %[value]\r\n"
             "fidivl %[divisor]\r\n"
@@ -208,10 +208,10 @@ TEST(ExceptionTest, X87StatusClobber) {
 
 // By default, SSE exceptions are masked and simply return a default value.
 TEST(ExceptionTest, SSEDivideByZeroMasked) {
-  uint32 status;
-  int32 quotient;
-  int32 value = 1;
-  int32 divisor = 0;
+  uint32_t status;
+  int32_t quotient;
+  int32_t value = 1;
+  int32_t divisor = 0;
   asm("cvtsi2ssl %[value], %%xmm0\r\n"
       "cvtsi2ssl %[divisor], %%xmm1\r\n"
       "divss %%xmm1, %%xmm0\r\n"
@@ -233,11 +233,11 @@ TEST(ExceptionTest, SSEDivideByZeroUnmasked) {
   EXPECT_EXIT(
       {
         // Clear the divide by zero exception mask.
-        constexpr uint32 kMXCSR = kMXCSRDefault & ~kMXCSRDiv0Mask;
+        constexpr uint32_t kMXCSR = kMXCSRDefault & ~kMXCSRDiv0Mask;
 
-        int32 quotient;
-        int32 value = 1;
-        int32 divisor = 0;
+        int32_t quotient;
+        int32_t value = 1;
+        int32_t divisor = 0;
         asm volatile(
             "ldmxcsr %[mxcsr]\r\n"
             "cvtsi2ssl %[value], %%xmm0\r\n"
@@ -254,10 +254,10 @@ TEST(ExceptionTest, SSEDivideByZeroUnmasked) {
 
 // Pending exceptions in the SSE status register are not clobbered by syscalls.
 TEST(ExceptionTest, SSEStatusClobber) {
-  uint32 mxcsr;
-  int32 quotient;
-  int32 value = 1;
-  int32 divisor = 0;
+  uint32_t mxcsr;
+  int32_t quotient;
+  int32_t value = 1;
+  int32_t divisor = 0;
   asm("cvtsi2ssl %[value], %%xmm0\r\n"
       "cvtsi2ssl %[divisor], %%xmm1\r\n"
       "divss %%xmm1, %%xmm0\r\n"
@@ -336,7 +336,7 @@ TEST(ExceptionTest, AlignmentCheck) {
         SetAlignmentCheck();
         for (int i = 0; i < 8; i++) {
           // At least 7/8 offsets will be unaligned here.
-          uint64* ptr = reinterpret_cast<uint64*>(&array[i]);
+          uint64_t* ptr = reinterpret_cast<uint64_t*>(&array[i]);
           asm("mov %0, 0(%0)\r\n" : : "r"(ptr) : "ax");
         }
       },
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 9c5a11206..b5e0a512b 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -62,7 +62,7 @@ constexpr char kExecFromThread[] = "--exec_exec_from_thread";
 
 // Runs file specified by dirfd and pathname with argv and checks that the exit
 // status is expect_status and that stderr contains expect_stderr.
-void CheckExecHelper(const absl::optional<int32> dirfd,
+void CheckExecHelper(const absl::optional<int32_t> dirfd,
                      const std::string& pathname, const ExecveArray& argv,
                      const ExecveArray& envv, const int flags,
                      int expect_status, const std::string& expect_stderr) {
@@ -143,15 +143,15 @@ void CheckExecHelper(const absl::optional<int32> dirfd,
 void CheckExec(const std::string& filename, const ExecveArray& argv,
                const ExecveArray& envv, int expect_status,
                const std::string& expect_stderr) {
-  CheckExecHelper(/*dirfd=*/absl::optional<int32>(), filename, argv, envv,
+  CheckExecHelper(/*dirfd=*/absl::optional<int32_t>(), filename, argv, envv,
                   /*flags=*/0, expect_status, expect_stderr);
 }
 
-void CheckExecveat(const int32 dirfd, const std::string& pathname,
+void CheckExecveat(const int32_t dirfd, const std::string& pathname,
                    const ExecveArray& argv, const ExecveArray& envv,
                    const int flags, int expect_status,
                    const std::string& expect_stderr) {
-  CheckExecHelper(absl::optional<int32>(dirfd), pathname, argv, envv, flags,
+  CheckExecHelper(absl::optional<int32_t>(dirfd), pathname, argv, envv, flags,
                   expect_status, expect_stderr);
 }
 
@@ -603,7 +603,7 @@ TEST(ExecveatTest, AbsolutePathWithFDCWD) {
 TEST(ExecveatTest, AbsolutePath) {
   std::string path = RunfilePath(kBasicWorkload);
   // File descriptor should be ignored when an absolute path is given.
-  const int32 badFD = -1;
+  const int32_t badFD = -1;
   CheckExecveat(badFD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0,
                 absl::StrCat(path, "\n"));
 }
diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc
index 144bf45cf..736452b0c 100644
--- a/test/syscalls/linux/exec_binary.cc
+++ b/test/syscalls/linux/exec_binary.cc
@@ -700,7 +700,7 @@ TEST(ElfTest, PIE) {
 
   // The first segment really needs to start at 0 for a normal PIE binary, and
   // thus includes the headers.
-  const uint64 offset = elf.phdrs[1].p_offset;
+  const uint64_t offset = elf.phdrs[1].p_offset;
   elf.phdrs[1].p_offset = 0x0;
   elf.phdrs[1].p_vaddr = 0x0;
   elf.phdrs[1].p_filesz += offset;
@@ -720,7 +720,7 @@ TEST(ElfTest, PIE) {
   struct user_regs_struct regs;
   ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
 
-  const uint64 load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t load_addr = regs.rip & ~(kPageSize - 1);
 
   EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
                          // text page.
@@ -789,7 +789,7 @@ TEST(ElfTest, PIENonZeroStart) {
   struct user_regs_struct regs;
   ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
 
-  const uint64 load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t load_addr = regs.rip & ~(kPageSize - 1);
 
   // The ELF is loaded at an arbitrary address, not the first PT_LOAD vaddr.
   //
@@ -859,7 +859,7 @@ TEST(ElfTest, ELFInterpreter) {
 
   // The first segment really needs to start at 0 for a normal PIE binary, and
   // thus includes the headers.
-  uint64 const offset = interpreter.phdrs[1].p_offset;
+  uint64_t const offset = interpreter.phdrs[1].p_offset;
   // N.B. Since Linux 4.10 (0036d1f7eb95b "binfmt_elf: fix calculations for bss
   // padding"), Linux unconditionally zeroes the remainder of the highest mapped
   // page in an interpreter, failing if the protections don't allow write. Thus
@@ -912,7 +912,7 @@ TEST(ElfTest, ELFInterpreter) {
   struct user_regs_struct regs;
   ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
 
-  const uint64 interp_load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t interp_load_addr = regs.rip & ~(kPageSize - 1);
 
   EXPECT_THAT(
       child, ContainsMappings(std::vector<ProcMapsEntry>({
@@ -1047,7 +1047,7 @@ TEST(ElfTest, ELFInterpreterRelative) {
 
   // The first segment really needs to start at 0 for a normal PIE binary, and
   // thus includes the headers.
-  uint64 const offset = interpreter.phdrs[1].p_offset;
+  uint64_t const offset = interpreter.phdrs[1].p_offset;
   // See comment in ElfTest.ELFInterpreter.
   interpreter.phdrs[1].p_flags = PF_R | PF_W | PF_X;
   interpreter.phdrs[1].p_offset = 0x0;
@@ -1086,7 +1086,7 @@ TEST(ElfTest, ELFInterpreterRelative) {
   struct user_regs_struct regs;
   ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
 
-  const uint64 interp_load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t interp_load_addr = regs.rip & ~(kPageSize - 1);
 
   EXPECT_THAT(
       child, ContainsMappings(std::vector<ProcMapsEntry>({
@@ -1109,7 +1109,7 @@ TEST(ElfTest, ELFInterpreterWrongArch) {
 
   // The first segment really needs to start at 0 for a normal PIE binary, and
   // thus includes the headers.
-  uint64 const offset = interpreter.phdrs[1].p_offset;
+  uint64_t const offset = interpreter.phdrs[1].p_offset;
   // See comment in ElfTest.ELFInterpreter.
   interpreter.phdrs[1].p_flags = PF_R | PF_W | PF_X;
   interpreter.phdrs[1].p_offset = 0x0;
@@ -1190,7 +1190,7 @@ TEST(ElfTest, ElfInterpreterNoExecute) {
 
   // The first segment really needs to start at 0 for a normal PIE binary, and
   // thus includes the headers.
-  uint64 const offset = interpreter.phdrs[1].p_offset;
+  uint64_t const offset = interpreter.phdrs[1].p_offset;
   // See comment in ElfTest.ELFInterpreter.
   interpreter.phdrs[1].p_flags = PF_R | PF_W | PF_X;
   interpreter.phdrs[1].p_offset = 0x0;
diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 6eb597eae..4f3aa81d6 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -46,9 +46,9 @@ ABSL_FLAG(bool, blocking, false,
           "Whether to set a blocking lock (otherwise non-blocking).");
 ABSL_FLAG(bool, retry_eintr, false,
           "Whether to retry in the subprocess on EINTR.");
-ABSL_FLAG(uint64, child_setlock_start, 0, "The value of struct flock start");
-ABSL_FLAG(uint64, child_setlock_len, 0, "The value of struct flock len");
-ABSL_FLAG(int32, socket_fd, -1,
+ABSL_FLAG(uint64_t, child_setlock_start, 0, "The value of struct flock start");
+ABSL_FLAG(uint64_t, child_setlock_len, 0, "The value of struct flock len");
+ABSL_FLAG(int32_t, socket_fd, -1,
           "A socket to use for communicating more state back "
           "to the parent.");
 
@@ -71,8 +71,8 @@ class FcntlLockTest : public ::testing::Test {
     EXPECT_THAT(close(fds_[1]), SyscallSucceeds());
   }
 
-  int64 GetSubprocessFcntlTimeInUsec() {
-    int64 ret = 0;
+  int64_t GetSubprocessFcntlTimeInUsec() {
+    int64_t ret = 0;
     EXPECT_THAT(ReadFd(fds_[0], reinterpret_cast<void*>(&ret), sizeof(ret)),
                 SyscallSucceedsWithValue(sizeof(ret)));
     return ret;
@@ -676,7 +676,7 @@ TEST_F(FcntlLockTest, SetWriteLockThenBlockingWriteLock) {
   // We will wait kHoldLockForSec before we release our lock allowing the
   // subprocess to obtain it.
   constexpr absl::Duration kHoldLockFor = absl::Seconds(5);
-  const int64 kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
+  const int64_t kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
 
   absl::SleepFor(kHoldLockFor);
 
@@ -685,7 +685,7 @@ TEST_F(FcntlLockTest, SetWriteLockThenBlockingWriteLock) {
   ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
 
   // Read the blocked time from the subprocess socket.
-  int64 subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
+  int64_t subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
 
   // We must have been waiting at least kMinBlockTime.
   EXPECT_GT(subprocess_blocked_time_usec, kMinBlockTimeUsec);
@@ -729,7 +729,7 @@ TEST_F(FcntlLockTest, SetReadLockThenBlockingWriteLock) {
   // subprocess to obtain it.
   constexpr absl::Duration kHoldLockFor = absl::Seconds(5);
 
-  const int64 kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
+  const int64_t kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
 
   absl::SleepFor(kHoldLockFor);
 
@@ -738,7 +738,7 @@ TEST_F(FcntlLockTest, SetReadLockThenBlockingWriteLock) {
   ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
 
   // Read the blocked time from the subprocess socket.
-  int64 subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
+  int64_t subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
 
   // We must have been waiting at least kMinBlockTime.
   EXPECT_GT(subprocess_blocked_time_usec, kMinBlockTimeUsec);
@@ -782,7 +782,7 @@ TEST_F(FcntlLockTest, SetWriteLockThenBlockingReadLock) {
   // subprocess to obtain it.
   constexpr absl::Duration kHoldLockFor = absl::Seconds(5);
 
-  const int64 kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
+  const int64_t kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
 
   absl::SleepFor(kHoldLockFor);
 
@@ -791,7 +791,7 @@ TEST_F(FcntlLockTest, SetWriteLockThenBlockingReadLock) {
   ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
 
   // Read the blocked time from the subprocess socket.
-  int64 subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
+  int64_t subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
 
   // We must have been waiting at least kMinBlockTime.
   EXPECT_GT(subprocess_blocked_time_usec, kMinBlockTimeUsec);
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index 486189697..371890110 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -270,7 +270,7 @@ TEST_F(ForkTest, Alarm) {
 
 // Child cannot affect parent private memory.
 TEST_F(ForkTest, PrivateMemory) {
-  std::atomic<uint32> local(0);
+  std::atomic<uint32_t> local(0);
 
   pid_t child1 = Fork();
   if (child1 == 0) {
diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc
index b4a7cc8d6..40c80a6e1 100644
--- a/test/syscalls/linux/futex.cc
+++ b/test/syscalls/linux/futex.cc
@@ -112,7 +112,7 @@ int futex_wake_bitset(bool priv, std::atomic<int>* uaddr, int count,
 }
 
 int futex_wake_op(bool priv, std::atomic<int>* uaddr1, std::atomic<int>* uaddr2,
-                  int nwake1, int nwake2, uint32 sub_op) {
+                  int nwake1, int nwake2, uint32_t sub_op) {
   int op = FUTEX_WAKE_OP;
   if (priv) {
     op |= FUTEX_PRIVATE_FLAG;
diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index 182d676d5..fdef646eb 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -48,26 +48,26 @@ constexpr int kBufSize = 1024;
 
 // C++-friendly version of struct inotify_event.
 struct Event {
-  int32 wd;
-  uint32 mask;
-  uint32 cookie;
-  uint32 len;
+  int32_t wd;
+  uint32_t mask;
+  uint32_t cookie;
+  uint32_t len;
   std::string name;
 
-  Event(uint32 mask, int32 wd, absl::string_view name, uint32 cookie)
+  Event(uint32_t mask, int32_t wd, absl::string_view name, uint32_t cookie)
       : wd(wd),
         mask(mask),
         cookie(cookie),
         len(name.size()),
         name(std::string(name)) {}
-  Event(uint32 mask, int32 wd, absl::string_view name)
+  Event(uint32_t mask, int32_t wd, absl::string_view name)
       : Event(mask, wd, name, 0) {}
-  Event(uint32 mask, int32 wd) : Event(mask, wd, "", 0) {}
+  Event(uint32_t mask, int32_t wd) : Event(mask, wd, "", 0) {}
   Event() : Event(0, 0, "", 0) {}
 };
 
 // Prints the symbolic name for a struct inotify_event's 'mask' field.
-std::string FlagString(uint32 flags) {
+std::string FlagString(uint32_t flags) {
   std::vector<std::string> names;
 
 #define EMIT(target)          \
@@ -320,7 +320,7 @@ PosixErrorOr<FileDescriptor> InotifyInit1(int flags) {
 }
 
 PosixErrorOr<int> InotifyAddWatch(int fd, const std::string& path,
-                                  uint32 mask) {
+                                  uint32_t mask) {
   int wd;
   EXPECT_THAT(wd = inotify_add_watch(fd, path.c_str(), mask),
               SyscallSucceeds());
@@ -647,7 +647,7 @@ TEST(Inotify, MoveGeneratesEvents) {
            Event(IN_MOVED_TO, root_wd, Basename(newpath), events[1].cookie)}));
   EXPECT_NE(events[0].cookie, 0);
   EXPECT_EQ(events[0].cookie, events[1].cookie);
-  uint32 last_cookie = events[0].cookie;
+  uint32_t last_cookie = events[0].cookie;
 
   // Test move from root -> root/dir1.
   newpath = NewTempAbsPathInDir(dir1.path());
@@ -841,7 +841,7 @@ TEST(Inotify, ConcurrentThreadsGeneratingEvents) {
   }
 
   auto test_thread = [&files]() {
-    uint32 seed = time(nullptr);
+    uint32_t seed = time(nullptr);
     for (int i = 0; i < 20; i++) {
       const TempPath& file = files[rand_r(&seed) % files.size()];
       const FileDescriptor file_fd =
@@ -960,7 +960,7 @@ TEST(Inotify, BlockingReadOnInotifyFd) {
   t.Join();
 
   // Make sure the event we got back is sane.
-  uint32 event_mask;
+  uint32_t event_mask;
   memcpy(&event_mask, buf.data() + offsetof(struct inotify_event, mask),
          sizeof(event_mask));
   EXPECT_EQ(event_mask, IN_ACCESS);
diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index f694a6360..6b472eb2f 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -24,12 +24,12 @@
 namespace gvisor {
 namespace testing {
 
-uint32 IPFromInetSockaddr(const struct sockaddr* addr) {
+uint32_t IPFromInetSockaddr(const struct sockaddr* addr) {
   auto* in_addr = reinterpret_cast<const struct sockaddr_in*>(addr);
   return in_addr->sin_addr.s_addr;
 }
 
-uint16 PortFromInetSockaddr(const struct sockaddr* addr) {
+uint16_t PortFromInetSockaddr(const struct sockaddr* addr) {
   auto* in_addr = reinterpret_cast<const struct sockaddr_in*>(addr);
   return ntohs(in_addr->sin_port);
 }
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 0eeca30dd..0f58e0f77 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -27,10 +27,10 @@ namespace gvisor {
 namespace testing {
 
 // Extracts the IP address from an inet sockaddr in network byte order.
-uint32 IPFromInetSockaddr(const struct sockaddr* addr);
+uint32_t IPFromInetSockaddr(const struct sockaddr* addr);
 
 // Extracts the port from an inet sockaddr in host byte order.
-uint16 PortFromInetSockaddr(const struct sockaddr* addr);
+uint16_t PortFromInetSockaddr(const struct sockaddr* addr);
 
 // InterfaceIndex returns the index of the named interface.
 PosixErrorOr<int> InterfaceIndex(std::string name);
diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc
index 52ffbe89d..b77e4cbd1 100644
--- a/test/syscalls/linux/itimer.cc
+++ b/test/syscalls/linux/itimer.cc
@@ -177,8 +177,8 @@ SignalTestResult ItimerSignalTest(int id, clock_t main_clock,
   SignalTestResult result;
 
   // Wait for the workers to be done and collect their sample counts.
-  result.worker_samples.push_back(reinterpret_cast<int64>(th1.Join()));
-  result.worker_samples.push_back(reinterpret_cast<int64>(th2.Join()));
+  result.worker_samples.push_back(reinterpret_cast<int64_t>(th1.Join()));
+  result.worker_samples.push_back(reinterpret_cast<int64_t>(th2.Join()));
   cleanup_itimer.Release()();
   result.expected_total = (Now(main_clock) - start) / kPeriod;
   result.main_thread_samples = signal_test_num_samples.load();
diff --git a/test/syscalls/linux/kill.cc b/test/syscalls/linux/kill.cc
index a2247fdeb..db29bd59c 100644
--- a/test/syscalls/linux/kill.cc
+++ b/test/syscalls/linux/kill.cc
@@ -32,8 +32,8 @@
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
-ABSL_FLAG(int32, scratch_uid, 65534, "scratch UID");
-ABSL_FLAG(int32, scratch_gid, 65534, "scratch GID");
+ABSL_FLAG(int32_t, scratch_uid, 65534, "scratch UID");
+ABSL_FLAG(int32_t, scratch_gid, 65534, "scratch GID");
 
 using ::testing::Ge;
 
diff --git a/test/syscalls/linux/link.cc b/test/syscalls/linux/link.cc
index 108a0c23e..e74fa2ed5 100644
--- a/test/syscalls/linux/link.cc
+++ b/test/syscalls/linux/link.cc
@@ -32,7 +32,7 @@
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
-ABSL_FLAG(int32, scratch_uid, 65534, "scratch UID");
+ABSL_FLAG(int32_t, scratch_uid, 65534, "scratch UID");
 
 namespace gvisor {
 namespace testing {
@@ -55,7 +55,8 @@ TEST(LinkTest, CanCreateLinkFile) {
   const std::string newname = NewTempAbsPath();
 
   // Get the initial link count.
-  uint64 initial_link_count = ASSERT_NO_ERRNO_AND_VALUE(Links(oldfile.path()));
+  uint64_t initial_link_count =
+      ASSERT_NO_ERRNO_AND_VALUE(Links(oldfile.path()));
 
   EXPECT_THAT(link(oldfile.path().c_str(), newname.c_str()), SyscallSucceeds());
 
diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc
index e10f250d1..e57b49a4a 100644
--- a/test/syscalls/linux/memfd.cc
+++ b/test/syscalls/linux/memfd.cc
@@ -61,7 +61,7 @@ int memfd_create(const std::string& name, unsigned int flags) {
 }
 
 PosixErrorOr<FileDescriptor> MemfdCreate(const std::string& name,
-                                         uint32 flags) {
+                                         uint32_t flags) {
   int fd = memfd_create(name, flags);
   if (fd < 0) {
     return PosixError(
diff --git a/test/syscalls/linux/memory_accounting.cc b/test/syscalls/linux/memory_accounting.cc
index 987dbd151..94aea4077 100644
--- a/test/syscalls/linux/memory_accounting.cc
+++ b/test/syscalls/linux/memory_accounting.cc
@@ -33,7 +33,7 @@ using ::absl::StrFormat;
 
 // AnonUsageFromMeminfo scrapes the current anonymous memory usage from
 // /proc/meminfo and returns it in bytes.
-PosixErrorOr<uint64> AnonUsageFromMeminfo() {
+PosixErrorOr<uint64_t> AnonUsageFromMeminfo() {
   ASSIGN_OR_RETURN_ERRNO(auto meminfo, GetContents("/proc/meminfo"));
   std::vector<std::string> lines(absl::StrSplit(meminfo, '\n'));
 
@@ -47,7 +47,7 @@ PosixErrorOr<uint64> AnonUsageFromMeminfo() {
         absl::StrSplit(line, ' ', absl::SkipEmpty()));
     if (parts.size() == 3) {
       // The size is the second field, let's try to parse it as a number.
-      ASSIGN_OR_RETURN_ERRNO(auto anon_kb, Atoi<uint64>(parts[1]));
+      ASSIGN_OR_RETURN_ERRNO(auto anon_kb, Atoi<uint64_t>(parts[1]));
       return anon_kb * 1024;
     }
 
@@ -65,10 +65,10 @@ TEST(MemoryAccounting, AnonAccountingPreservedOnSaveRestore) {
   // the test.
   SKIP_IF(!IsRunningOnGvisor());
 
-  uint64 anon_initial = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
+  uint64_t anon_initial = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
 
   // Cause some anonymous memory usage.
-  uint64 map_bytes = Megabytes(512);
+  uint64_t map_bytes = Megabytes(512);
   char* mem =
       static_cast<char*>(mmap(nullptr, map_bytes, PROT_READ | PROT_WRITE,
                               MAP_POPULATE | MAP_ANON | MAP_PRIVATE, -1, 0));
@@ -77,11 +77,11 @@ TEST(MemoryAccounting, AnonAccountingPreservedOnSaveRestore) {
 
   // Write something to each page to prevent them from being decommited on
   // S/R. Zero pages are dropped on save.
-  for (uint64 i = 0; i < map_bytes; i += kPageSize) {
+  for (uint64_t i = 0; i < map_bytes; i += kPageSize) {
     mem[i] = 'a';
   }
 
-  uint64 anon_after_alloc = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
+  uint64_t anon_after_alloc = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
   EXPECT_THAT(anon_after_alloc,
               EquivalentWithin(anon_initial + map_bytes, 0.03));
 
@@ -90,7 +90,7 @@ TEST(MemoryAccounting, AnonAccountingPreservedOnSaveRestore) {
   MaybeSave();
 
   // Usage should remain the same across S/R.
-  uint64 anon_after_sr = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
+  uint64_t anon_after_sr = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
   EXPECT_THAT(anon_after_sr, EquivalentWithin(anon_after_alloc, 0.03));
 }
 
diff --git a/test/syscalls/linux/mempolicy.cc b/test/syscalls/linux/mempolicy.cc
index 46bbbc923..9d5f47651 100644
--- a/test/syscalls/linux/mempolicy.cc
+++ b/test/syscalls/linux/mempolicy.cc
@@ -43,12 +43,12 @@ namespace {
 #define MPOL_MF_MOVE (1 << 1)
 #define MPOL_MF_MOVE_ALL (1 << 2)
 
-int get_mempolicy(int *policy, uint64 *nmask, uint64 maxnode, void *addr,
+int get_mempolicy(int *policy, uint64_t *nmask, uint64_t maxnode, void *addr,
                   int flags) {
   return syscall(SYS_get_mempolicy, policy, nmask, maxnode, addr, flags);
 }
 
-int set_mempolicy(int mode, uint64 *nmask, uint64 maxnode) {
+int set_mempolicy(int mode, uint64_t *nmask, uint64_t maxnode) {
   return syscall(SYS_set_mempolicy, mode, nmask, maxnode);
 }
 
@@ -68,8 +68,8 @@ Cleanup ScopedMempolicy() {
 
 // Temporarily change the memory policy for the calling thread within the
 // caller's scope.
-PosixErrorOr<Cleanup> ScopedSetMempolicy(int mode, uint64 *nmask,
-                                         uint64 maxnode) {
+PosixErrorOr<Cleanup> ScopedSetMempolicy(int mode, uint64_t *nmask,
+                                         uint64_t maxnode) {
   if (set_mempolicy(mode, nmask, maxnode)) {
     return PosixError(errno, "set_mempolicy");
   }
@@ -78,7 +78,7 @@ PosixErrorOr<Cleanup> ScopedSetMempolicy(int mode, uint64 *nmask,
 
 TEST(MempolicyTest, CheckDefaultPolicy) {
   int mode = 0;
-  uint64 nodemask = 0;
+  uint64_t nodemask = 0;
   ASSERT_THAT(get_mempolicy(&mode, &nodemask, sizeof(nodemask) * BITS_PER_BYTE,
                             nullptr, 0),
               SyscallSucceeds());
@@ -88,12 +88,12 @@ TEST(MempolicyTest, CheckDefaultPolicy) {
 }
 
 TEST(MempolicyTest, PolicyPreservedAfterSetMempolicy) {
-  uint64 nodemask = 0x1;
+  uint64_t nodemask = 0x1;
   auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSetMempolicy(
       MPOL_BIND, &nodemask, sizeof(nodemask) * BITS_PER_BYTE));
 
   int mode = 0;
-  uint64 nodemask_after = 0x0;
+  uint64_t nodemask_after = 0x0;
   ASSERT_THAT(get_mempolicy(&mode, &nodemask_after,
                             sizeof(nodemask_after) * BITS_PER_BYTE, nullptr, 0),
               SyscallSucceeds());
@@ -118,7 +118,7 @@ TEST(MempolicyTest, PolicyPreservedAfterSetMempolicy) {
 
 TEST(MempolicyTest, SetMempolicyRejectsInvalidInputs) {
   auto cleanup = ScopedMempolicy();
-  uint64 nodemask;
+  uint64_t nodemask;
 
   if (IsRunningOnGvisor()) {
     // Invalid nodemask, we only support a single node on gvisor.
@@ -165,7 +165,7 @@ TEST(MempolicyTest, EmptyNodemaskOnSet) {
               SyscallFailsWithErrno(EINVAL));
   EXPECT_THAT(set_mempolicy(MPOL_PREFERRED, nullptr, 1), SyscallSucceeds());
 
-  uint64 nodemask = 0x1;
+  uint64_t nodemask = 0x1;
   EXPECT_THAT(set_mempolicy(MPOL_DEFAULT, &nodemask, 0),
               SyscallFailsWithErrno(EINVAL));
   EXPECT_THAT(set_mempolicy(MPOL_BIND, &nodemask, 0),
@@ -175,7 +175,7 @@ TEST(MempolicyTest, EmptyNodemaskOnSet) {
 }
 
 TEST(MempolicyTest, QueryAvailableNodes) {
-  uint64 nodemask = 0;
+  uint64_t nodemask = 0;
   ASSERT_THAT(
       get_mempolicy(nullptr, &nodemask, sizeof(nodemask) * BITS_PER_BYTE,
                     nullptr, MPOL_F_MEMS_ALLOWED),
@@ -197,8 +197,8 @@ TEST(MempolicyTest, QueryAvailableNodes) {
 }
 
 TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) {
-  uint64 dummy_stack_address;
-  auto dummy_heap_address = absl::make_unique<uint64>();
+  uint64_t dummy_stack_address;
+  auto dummy_heap_address = absl::make_unique<uint64_t>();
   int mode;
 
   for (auto ptr : {&dummy_stack_address, dummy_heap_address.get()}) {
@@ -228,7 +228,7 @@ TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) {
 
 TEST(MempolicyTest, GetMempolicyCanOmitPointers) {
   int mode;
-  uint64 nodemask;
+  uint64_t nodemask;
 
   // Omit nodemask pointer.
   ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, nullptr, 0), SyscallSucceeds());
@@ -249,7 +249,7 @@ TEST(MempolicyTest, GetMempolicyNextInterleaveNode) {
               SyscallFailsWithErrno(EINVAL));
 
   // Set default policy for thread to MPOL_INTERLEAVE.
-  uint64 nodemask = 0x1;
+  uint64_t nodemask = 0x1;
   auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSetMempolicy(
       MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * BITS_PER_BYTE));
 
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index 9b2270c8d..1c4d9f1c7 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -50,13 +50,13 @@ namespace testing {
 
 namespace {
 
-PosixErrorOr<int64> VirtualMemorySize() {
+PosixErrorOr<int64_t> VirtualMemorySize() {
   ASSIGN_OR_RETURN_ERRNO(auto contents, GetContents("/proc/self/statm"));
   std::vector<std::string> parts = absl::StrSplit(contents, ' ');
   if (parts.empty()) {
     return PosixError(EINVAL, "Unable to parse /proc/self/statm");
   }
-  ASSIGN_OR_RETURN_ERRNO(auto pages, Atoi<int64>(parts[0]));
+  ASSIGN_OR_RETURN_ERRNO(auto pages, Atoi<int64_t>(parts[0]));
   return pages * getpagesize();
 }
 
@@ -245,7 +245,7 @@ TEST_F(MMapTest, MapDevZeroSharedFdNoPersistence) {
   // Create a second mapping via the same fd.
   void* psec_map = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
                         dev_zero.get(), 0);
-  ASSERT_THAT(reinterpret_cast<int64>(psec_map), SyscallSucceeds());
+  ASSERT_THAT(reinterpret_cast<int64_t>(psec_map), SyscallSucceeds());
 
   // Always unmap.
   auto cleanup_psec_map = Cleanup(
@@ -690,10 +690,10 @@ TEST_F(MMapTest, ExceedLimitDataPrlimitPID) {
 }
 
 TEST_F(MMapTest, NoExceedLimitAS) {
-  constexpr uint64 kAllocBytes = 200 << 20;
+  constexpr uint64_t kAllocBytes = 200 << 20;
   // Add some headroom to the AS limit in case of e.g. unexpected stack
   // expansion.
-  constexpr uint64 kExtraASBytes = kAllocBytes + (20 << 20);
+  constexpr uint64_t kExtraASBytes = kAllocBytes + (20 << 20);
   static_assert(kAllocBytes < kExtraASBytes,
                 "test depends on allocation not exceeding AS limit");
 
@@ -708,10 +708,10 @@ TEST_F(MMapTest, NoExceedLimitAS) {
 }
 
 TEST_F(MMapTest, ExceedLimitAS) {
-  constexpr uint64 kAllocBytes = 200 << 20;
+  constexpr uint64_t kAllocBytes = 200 << 20;
   // Add some headroom to the AS limit in case of e.g. unexpected stack
   // expansion.
-  constexpr uint64 kExtraASBytes = 20 << 20;
+  constexpr uint64_t kExtraASBytes = 20 << 20;
   static_assert(kAllocBytes > kExtraASBytes,
                 "test depends on allocation exceeding AS limit");
 
@@ -1469,7 +1469,7 @@ TEST_F(MMapFileTest, InternalSigBusZeroing) {
               SyscallFailsWithErrno(EFAULT));
 }
 
-// Checks that mmaps with a length of uint64(-PAGE_SIZE + 1) or greater do not
+// Checks that mmaps with a length of uint64_t(-PAGE_SIZE + 1) or greater do not
 // induce a sentry panic (due to "rounding up" to 0).
 TEST_F(MMapTest, HugeLength) {
   EXPECT_THAT(Map(0, static_cast<uint64_t>(-kPageSize + 1), PROT_NONE,
diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc
index a5e790729..267ae19f6 100644
--- a/test/syscalls/linux/open.cc
+++ b/test/syscalls/linux/open.cc
@@ -193,7 +193,7 @@ TEST_F(OpenTest, Fault) {
 
 TEST_F(OpenTest, AppendOnly) {
   // First write some data to the fresh file.
-  const int64 kBufSize = 1024;
+  const int64_t kBufSize = 1024;
   std::vector<char> buf(kBufSize, 'a');
 
   FileDescriptor fd0 = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
diff --git a/test/syscalls/linux/partial_bad_buffer.cc b/test/syscalls/linux/partial_bad_buffer.cc
index 55eb9361f..df7129acc 100644
--- a/test/syscalls/linux/partial_bad_buffer.cc
+++ b/test/syscalls/linux/partial_bad_buffer.cc
@@ -363,7 +363,7 @@ TEST_F(PartialBadBufferTest, SendMsgTCP) {
   // byte past the valid page and check that it triggers an EFAULT
   // correctly. Otherwise in gVisor the sendmsg call will just return with no
   // error with kPageSize bytes written successfully.
-  const uint32 buf_size = kPageSize + 1;
+  const uint32_t buf_size = kPageSize + 1;
   ASSERT_THAT(setsockopt(send_socket.get(), SOL_SOCKET, SO_SNDBUF, &buf_size,
                          sizeof(buf_size)),
               SyscallSucceedsWithValue(0));
diff --git a/test/syscalls/linux/prctl_setuid.cc b/test/syscalls/linux/prctl_setuid.cc
index ad39a8463..30f0d75b3 100644
--- a/test/syscalls/linux/prctl_setuid.cc
+++ b/test/syscalls/linux/prctl_setuid.cc
@@ -26,7 +26,7 @@
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
-ABSL_FLAG(int32, scratch_uid, 65534, "scratch UID");
+ABSL_FLAG(int32_t, scratch_uid, 65534, "scratch UID");
 // This flag is used to verify that after an exec PR_GET_KEEPCAPS
 // returns 0, the return code will be offset by kPrGetKeepCapsExitBase.
 ABSL_FLAG(bool, prctl_pr_get_keepcaps, false,
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 0d5899ec9..bf9bb45d3 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -463,12 +463,12 @@ std::string AnonymousMapsEntryForMapping(const Mapping& m, int prot) {
   return AnonymousMapsEntry(m.addr(), m.len(), prot);
 }
 
-PosixErrorOr<std::map<uint64, uint64>> ReadProcSelfAuxv() {
+PosixErrorOr<std::map<uint64_t, uint64_t>> ReadProcSelfAuxv() {
   std::string auxv_file;
   RETURN_IF_ERRNO(GetContents("/proc/self/auxv", &auxv_file));
   const Elf64_auxv_t* auxv_data =
       reinterpret_cast<const Elf64_auxv_t*>(auxv_file.data());
-  std::map<uint64, uint64> auxv_entries;
+  std::map<uint64_t, uint64_t> auxv_entries;
   for (int i = 0; auxv_data[i].a_type != AT_NULL; i++) {
     auto a_type = auxv_data[i].a_type;
     EXPECT_EQ(0, auxv_entries.count(a_type)) << "a_type: " << a_type;
@@ -877,7 +877,7 @@ TEST(ProcStat, Fields) {
 
     // All fields besides itime are valid base 10 numbers.
     for (size_t i = 1; i < fields.size(); i++) {
-      uint64 val;
+      uint64_t val;
       EXPECT_TRUE(absl::SimpleAtoi(fields[i], &val)) << proc_stat;
     }
   }
@@ -904,7 +904,7 @@ TEST(ProcLoadavg, Fields) {
   EXPECT_EQ(fields.size(), 6) << proc_loadvg;
 
   double val;
-  uint64 val2;
+  uint64_t val2;
   // First three fields are floating point numbers.
   EXPECT_TRUE(absl::SimpleAtod(fields[0], &val)) << proc_loadvg;
   EXPECT_TRUE(absl::SimpleAtod(fields[1], &val)) << proc_loadvg;
@@ -936,19 +936,19 @@ TEST_P(ProcPidStatTest, HasBasicFields) {
   // boot time will be very close, and the proc starttime field (which is the
   // delta of the two times) will be 0.  For that unfortunate reason, we can
   // only check that starttime >= 0, and not that it is strictly > 0.
-  uint64 starttime;
+  uint64_t starttime;
   ASSERT_TRUE(absl::SimpleAtoi(fields[21], &starttime));
   EXPECT_GE(starttime, 0);
 
-  uint64 vss;
+  uint64_t vss;
   ASSERT_TRUE(absl::SimpleAtoi(fields[22], &vss));
   EXPECT_GT(vss, 0);
 
-  uint64 rss;
+  uint64_t rss;
   ASSERT_TRUE(absl::SimpleAtoi(fields[23], &rss));
   EXPECT_GT(rss, 0);
 
-  uint64 rsslim;
+  uint64_t rsslim;
   ASSERT_TRUE(absl::SimpleAtoi(fields[24], &rsslim));
   EXPECT_GT(rsslim, 0);
 }
@@ -965,11 +965,11 @@ TEST_P(ProcPidStatmTest, HasBasicFields) {
   std::vector<std::string> fields = absl::StrSplit(proc_pid_statm, ' ');
   ASSERT_GE(fields.size(), 7);
 
-  uint64 vss;
+  uint64_t vss;
   ASSERT_TRUE(absl::SimpleAtoi(fields[0], &vss));
   EXPECT_GT(vss, 0);
 
-  uint64 rss;
+  uint64_t rss;
   ASSERT_TRUE(absl::SimpleAtoi(fields[1], &rss));
   EXPECT_GT(rss, 0);
 }
@@ -977,7 +977,7 @@ TEST_P(ProcPidStatmTest, HasBasicFields) {
 INSTANTIATE_TEST_SUITE_P(SelfAndNumericPid, ProcPidStatmTest,
                          ::testing::Values("self", absl::StrCat(getpid())));
 
-PosixErrorOr<uint64> CurrentRSS() {
+PosixErrorOr<uint64_t> CurrentRSS() {
   ASSIGN_OR_RETURN_ERRNO(auto proc_self_stat, GetContents("/proc/self/stat"));
   if (proc_self_stat.empty()) {
     return PosixError(EINVAL, "empty /proc/self/stat");
@@ -990,7 +990,7 @@ PosixErrorOr<uint64> CurrentRSS() {
         absl::StrCat("/proc/self/stat has too few fields: ", proc_self_stat));
   }
 
-  uint64 rss;
+  uint64_t rss;
   if (!absl::SimpleAtoi(fields[23], &rss)) {
     return PosixError(
         EINVAL, absl::StrCat("/proc/self/stat RSS field is not a number: ",
@@ -1002,14 +1002,14 @@ PosixErrorOr<uint64> CurrentRSS() {
 }
 
 // The size of mapping created by MapPopulateRSS.
-constexpr uint64 kMappingSize = 100 << 20;
+constexpr uint64_t kMappingSize = 100 << 20;
 
 // Tolerance on RSS comparisons to account for background thread mappings,
 // reclaimed pages, newly faulted pages, etc.
-constexpr uint64 kRSSTolerance = 5 << 20;
+constexpr uint64_t kRSSTolerance = 5 << 20;
 
 // Capture RSS before and after an anonymous mapping with passed prot.
-void MapPopulateRSS(int prot, uint64* before, uint64* after) {
+void MapPopulateRSS(int prot, uint64_t* before, uint64_t* after) {
   *before = ASSERT_NO_ERRNO_AND_VALUE(CurrentRSS());
 
   // N.B. The kernel asynchronously accumulates per-task RSS counters into the
@@ -1040,7 +1040,7 @@ void MapPopulateRSS(int prot, uint64* before, uint64* after) {
 
 // PROT_WRITE + MAP_POPULATE anonymous mappings are always committed.
 TEST(ProcSelfStat, PopulateWriteRSS) {
-  uint64 before, after;
+  uint64_t before, after;
   MapPopulateRSS(PROT_READ | PROT_WRITE, &before, &after);
 
   // Mapping is committed.
@@ -1049,7 +1049,7 @@ TEST(ProcSelfStat, PopulateWriteRSS) {
 
 // PROT_NONE + MAP_POPULATE anonymous mappings are never committed.
 TEST(ProcSelfStat, PopulateNoneRSS) {
-  uint64 before, after;
+  uint64_t before, after;
   MapPopulateRSS(PROT_NONE, &before, &after);
 
   // Mapping not committed.
@@ -1766,7 +1766,7 @@ TEST(ProcTask, VerifyTaskDirNlinks) {
 
   // Once we reach the test body, we can count on the thread count being stable
   // unless we spawn a new one.
-  uint64 initial_links = ASSERT_NO_ERRNO_AND_VALUE(Links("/proc/self/task"));
+  uint64_t initial_links = ASSERT_NO_ERRNO_AND_VALUE(Links("/proc/self/task"));
   ASSERT_GE(initial_links, 3);
 
   // For each new subtask, we should gain a new link.
@@ -1864,9 +1864,9 @@ TEST(ProcFilesystems, Bug65172365) {
 }
 
 TEST(ProcFilesystems, PresenceOfShmMaxMniAll) {
-  uint64 shmmax = 0;
-  uint64 shmall = 0;
-  uint64 shmmni = 0;
+  uint64_t shmmax = 0;
+  uint64_t shmall = 0;
+  uint64_t shmmni = 0;
   std::string proc_file;
   proc_file = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/shmmax"));
   ASSERT_FALSE(proc_file.empty());
diff --git a/test/syscalls/linux/proc_net_tcp.cc b/test/syscalls/linux/proc_net_tcp.cc
index 77183420b..5b6e3e3cd 100644
--- a/test/syscalls/linux/proc_net_tcp.cc
+++ b/test/syscalls/linux/proc_net_tcp.cc
@@ -40,15 +40,15 @@ constexpr char kProcNetTCPHeader[] =
 
 // TCPEntry represents a single entry from /proc/net/tcp.
 struct TCPEntry {
-  uint32 local_addr;
-  uint16 local_port;
+  uint32_t local_addr;
+  uint16_t local_port;
 
-  uint32 remote_addr;
-  uint16 remote_port;
+  uint32_t remote_addr;
+  uint16_t remote_port;
 
-  uint64 state;
-  uint64 uid;
-  uint64 inode;
+  uint64_t state;
+  uint64_t uid;
+  uint64_t inode;
 };
 
 // Finds the first entry in 'entries' for which 'predicate' returns true.
@@ -69,8 +69,8 @@ bool FindBy(const std::vector<TCPEntry>& entries, TCPEntry* match,
 
 bool FindByLocalAddr(const std::vector<TCPEntry>& entries, TCPEntry* match,
                      const struct sockaddr* addr) {
-  uint32 host = IPFromInetSockaddr(addr);
-  uint16 port = PortFromInetSockaddr(addr);
+  uint32_t host = IPFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy(entries, match, [host, port](const TCPEntry& e) {
     return (e.local_addr == host && e.local_port == port);
   });
@@ -78,8 +78,8 @@ bool FindByLocalAddr(const std::vector<TCPEntry>& entries, TCPEntry* match,
 
 bool FindByRemoteAddr(const std::vector<TCPEntry>& entries, TCPEntry* match,
                       const struct sockaddr* addr) {
-  uint32 host = IPFromInetSockaddr(addr);
-  uint16 port = PortFromInetSockaddr(addr);
+  uint32_t host = IPFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy(entries, match, [host, port](const TCPEntry& e) {
     return (e.remote_addr == host && e.remote_port == port);
   });
@@ -131,8 +131,8 @@ PosixErrorOr<std::vector<TCPEntry>> ProcNetTCPEntries() {
     ASSIGN_OR_RETURN_ERRNO(entry.remote_port, AtoiBase(fields[4], 16));
 
     ASSIGN_OR_RETURN_ERRNO(entry.state, AtoiBase(fields[5], 16));
-    ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64>(fields[11]));
-    ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64>(fields[13]));
+    ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64_t>(fields[11]));
+    ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64_t>(fields[13]));
 
     entries.push_back(entry);
   }
@@ -234,8 +234,8 @@ TEST(ProcNetTCP, State) {
   FileDescriptor accepted =
       ASSERT_NO_ERRNO_AND_VALUE(Accept(server->get(), nullptr, nullptr));
 
-  const uint32 accepted_local_host = IPFromInetSockaddr(&addr);
-  const uint16 accepted_local_port = PortFromInetSockaddr(&addr);
+  const uint32_t accepted_local_host = IPFromInetSockaddr(&addr);
+  const uint16_t accepted_local_port = PortFromInetSockaddr(&addr);
 
   entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries());
   TCPEntry accepted_entry;
@@ -258,14 +258,14 @@ constexpr char kProcNetTCP6Header[] =
 // TCP6Entry represents a single entry from /proc/net/tcp6.
 struct TCP6Entry {
   struct in6_addr local_addr;
-  uint16 local_port;
+  uint16_t local_port;
 
   struct in6_addr remote_addr;
-  uint16 remote_port;
+  uint16_t remote_port;
 
-  uint64 state;
-  uint64 uid;
-  uint64 inode;
+  uint64_t state;
+  uint64_t uid;
+  uint64_t inode;
 };
 
 bool IPv6AddrEqual(const struct in6_addr* a1, const struct in6_addr* a2) {
@@ -296,7 +296,7 @@ const struct in6_addr* IP6FromInetSockaddr(const struct sockaddr* addr) {
 bool FindByLocalAddr6(const std::vector<TCP6Entry>& entries, TCP6Entry* match,
                       const struct sockaddr* addr) {
   const struct in6_addr* local = IP6FromInetSockaddr(addr);
-  uint16 port = PortFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy6(entries, match, [local, port](const TCP6Entry& e) {
     return (IPv6AddrEqual(&e.local_addr, local) && e.local_port == port);
   });
@@ -305,22 +305,22 @@ bool FindByLocalAddr6(const std::vector<TCP6Entry>& entries, TCP6Entry* match,
 bool FindByRemoteAddr6(const std::vector<TCP6Entry>& entries, TCP6Entry* match,
                        const struct sockaddr* addr) {
   const struct in6_addr* remote = IP6FromInetSockaddr(addr);
-  uint16 port = PortFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy6(entries, match, [remote, port](const TCP6Entry& e) {
     return (IPv6AddrEqual(&e.remote_addr, remote) && e.remote_port == port);
   });
 }
 
 void ReadIPv6Address(std::string s, struct in6_addr* addr) {
-  uint32 a0, a1, a2, a3;
+  uint32_t a0, a1, a2, a3;
   const char* fmt = "%08X%08X%08X%08X";
   EXPECT_EQ(sscanf(s.c_str(), fmt, &a0, &a1, &a2, &a3), 4);
 
-  uint8* b = addr->s6_addr;
-  *((uint32*)&b[0]) = a0;
-  *((uint32*)&b[4]) = a1;
-  *((uint32*)&b[8]) = a2;
-  *((uint32*)&b[12]) = a3;
+  uint8_t* b = addr->s6_addr;
+  *((uint32_t*)&b[0]) = a0;
+  *((uint32_t*)&b[4]) = a1;
+  *((uint32_t*)&b[8]) = a2;
+  *((uint32_t*)&b[12]) = a3;
 }
 
 // Returns a parsed representation of /proc/net/tcp6 entries.
@@ -367,8 +367,8 @@ PosixErrorOr<std::vector<TCP6Entry>> ProcNetTCP6Entries() {
     ReadIPv6Address(fields[3], &entry.remote_addr);
     ASSIGN_OR_RETURN_ERRNO(entry.remote_port, AtoiBase(fields[4], 16));
     ASSIGN_OR_RETURN_ERRNO(entry.state, AtoiBase(fields[5], 16));
-    ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64>(fields[11]));
-    ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64>(fields[13]));
+    ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64_t>(fields[11]));
+    ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64_t>(fields[13]));
 
     entries.push_back(entry);
   }
@@ -476,7 +476,7 @@ TEST(ProcNetTCP6, State) {
       ASSERT_NO_ERRNO_AND_VALUE(Accept(server->get(), nullptr, nullptr));
 
   const struct in6_addr* local = IP6FromInetSockaddr(addr);
-  const uint16 accepted_local_port = PortFromInetSockaddr(addr);
+  const uint16_t accepted_local_port = PortFromInetSockaddr(addr);
 
   entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCP6Entries());
   TCP6Entry accepted_entry;
diff --git a/test/syscalls/linux/proc_net_udp.cc b/test/syscalls/linux/proc_net_udp.cc
index 98c1e0cf1..786b4b4af 100644
--- a/test/syscalls/linux/proc_net_udp.cc
+++ b/test/syscalls/linux/proc_net_udp.cc
@@ -40,15 +40,15 @@ constexpr char kProcNetUDPHeader[] =
 
 // UDPEntry represents a single entry from /proc/net/udp.
 struct UDPEntry {
-  uint32 local_addr;
-  uint16 local_port;
+  uint32_t local_addr;
+  uint16_t local_port;
 
-  uint32 remote_addr;
-  uint16 remote_port;
+  uint32_t remote_addr;
+  uint16_t remote_port;
 
-  uint64 state;
-  uint64 uid;
-  uint64 inode;
+  uint64_t state;
+  uint64_t uid;
+  uint64_t inode;
 };
 
 std::string DescribeFirstInetSocket(const SocketPair& sockets) {
@@ -81,8 +81,8 @@ bool FindBy(const std::vector<UDPEntry>& entries, UDPEntry* match,
 
 bool FindByLocalAddr(const std::vector<UDPEntry>& entries, UDPEntry* match,
                      const struct sockaddr* addr) {
-  uint32 host = IPFromInetSockaddr(addr);
-  uint16 port = PortFromInetSockaddr(addr);
+  uint32_t host = IPFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy(entries, match, [host, port](const UDPEntry& e) {
     return (e.local_addr == host && e.local_port == port);
   });
@@ -90,14 +90,14 @@ bool FindByLocalAddr(const std::vector<UDPEntry>& entries, UDPEntry* match,
 
 bool FindByRemoteAddr(const std::vector<UDPEntry>& entries, UDPEntry* match,
                       const struct sockaddr* addr) {
-  uint32 host = IPFromInetSockaddr(addr);
-  uint16 port = PortFromInetSockaddr(addr);
+  uint32_t host = IPFromInetSockaddr(addr);
+  uint16_t port = PortFromInetSockaddr(addr);
   return FindBy(entries, match, [host, port](const UDPEntry& e) {
     return (e.remote_addr == host && e.remote_port == port);
   });
 }
 
-PosixErrorOr<uint64> InodeFromSocketFD(int fd) {
+PosixErrorOr<uint64_t> InodeFromSocketFD(int fd) {
   ASSIGN_OR_RETURN_ERRNO(struct stat s, Fstat(fd));
   if (!S_ISSOCK(s.st_mode)) {
     return PosixError(EINVAL, StrFormat("FD %d is not a socket", fd));
@@ -107,7 +107,7 @@ PosixErrorOr<uint64> InodeFromSocketFD(int fd) {
 
 PosixErrorOr<bool> FindByFD(const std::vector<UDPEntry>& entries,
                             UDPEntry* match, int fd) {
-  ASSIGN_OR_RETURN_ERRNO(uint64 inode, InodeFromSocketFD(fd));
+  ASSIGN_OR_RETURN_ERRNO(uint64_t inode, InodeFromSocketFD(fd));
   return FindBy(entries, match,
                 [inode](const UDPEntry& e) { return (e.inode == inode); });
 }
@@ -158,8 +158,8 @@ PosixErrorOr<std::vector<UDPEntry>> ProcNetUDPEntries() {
     ASSIGN_OR_RETURN_ERRNO(entry.remote_port, AtoiBase(fields[4], 16));
 
     ASSIGN_OR_RETURN_ERRNO(entry.state, AtoiBase(fields[5], 16));
-    ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64>(fields[11]));
-    ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64>(fields[13]));
+    ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64_t>(fields[11]));
+    ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64_t>(fields[13]));
 
     // Linux shares internal data structures between TCP and UDP sockets. The
     // proc entries for UDP sockets share some fields with TCP sockets, but
@@ -267,7 +267,7 @@ TEST(ProcNetUDP, BoundEntry) {
   struct sockaddr addr;
   socklen_t len = sizeof(addr);
   ASSERT_THAT(getsockname(socket->get(), &addr, &len), SyscallSucceeds());
-  uint16 port = PortFromInetSockaddr(&addr);
+  uint16_t port = PortFromInetSockaddr(&addr);
 
   std::vector<UDPEntry> entries =
       ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc
index 2fe63f215..66db0acaa 100644
--- a/test/syscalls/linux/proc_net_unix.cc
+++ b/test/syscalls/linux/proc_net_unix.cc
@@ -46,12 +46,12 @@ enum {
 // UnixEntry represents a single entry from /proc/net/unix.
 struct UnixEntry {
   uintptr_t addr;
-  uint64 refs;
-  uint64 protocol;
-  uint64 flags;
-  uint64 type;
-  uint64 state;
-  uint64 inode;
+  uint64_t refs;
+  uint64_t protocol;
+  uint64_t flags;
+  uint64_t type;
+  uint64_t state;
+  uint64_t inode;
   std::string path;
 };
 
diff --git a/test/syscalls/linux/proc_pid_uid_gid_map.cc b/test/syscalls/linux/proc_pid_uid_gid_map.cc
index 8e268ebd1..748f7be58 100644
--- a/test/syscalls/linux/proc_pid_uid_gid_map.cc
+++ b/test/syscalls/linux/proc_pid_uid_gid_map.cc
@@ -117,13 +117,13 @@ void DenyPidSetgroups(pid_t pid) {
 }
 
 // Returns a valid UID/GID that isn't id.
-uint32 another_id(uint32 id) { return (id + 1) % 65535; }
+uint32_t another_id(uint32_t id) { return (id + 1) % 65535; }
 
 struct TestParam {
   std::string desc;
   int cap;
   std::function<std::string(absl::string_view)> get_map_filename;
-  std::function<uint32()> get_current_id;
+  std::function<uint32_t()> get_current_id;
 };
 
 std::string DescribeTestParam(const ::testing::TestParamInfo<TestParam>& info) {
@@ -135,17 +135,17 @@ std::vector<TestParam> UidGidMapTestParams() {
                     [](absl::string_view pid) {
                       return absl::StrCat("/proc/", pid, "/uid_map");
                     },
-                    []() -> uint32 { return getuid(); }},
+                    []() -> uint32_t { return getuid(); }},
           TestParam{"GID", CAP_SETGID,
                     [](absl::string_view pid) {
                       return absl::StrCat("/proc/", pid, "/gid_map");
                     },
-                    []() -> uint32 { return getgid(); }}};
+                    []() -> uint32_t { return getgid(); }}};
 }
 
 class ProcUidGidMapTest : public ::testing::TestWithParam<TestParam> {
  protected:
-  uint32 CurrentID() { return GetParam().get_current_id(); }
+  uint32_t CurrentID() { return GetParam().get_current_id(); }
 };
 
 class ProcSelfUidGidMapTest : public ProcUidGidMapTest {
@@ -198,7 +198,7 @@ TEST_P(ProcSelfUidGidMapTest, IsInitiallyEmpty) {
 
 TEST_P(ProcSelfUidGidMapTest, IdentityMapOwnID) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
-  uint32 id = CurrentID();
+  uint32_t id = CurrentID();
   std::string line = absl::StrCat(id, " ", id, " 1");
   EXPECT_THAT(
       InNewUserNamespaceWithMapFD([&](int fd) {
@@ -213,7 +213,7 @@ TEST_P(ProcSelfUidGidMapTest, TrailingNewlineAndNULIgnored) {
   // and an invalid (incomplete) map entry are appended to the valid entry. The
   // newline should be accepted, and everything after the NUL should be ignored.
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
-  uint32 id = CurrentID();
+  uint32_t id = CurrentID();
   std::string line = absl::StrCat(id, " ", id, " 1\n\0 4 3");
   EXPECT_THAT(
       InNewUserNamespaceWithMapFD([&](int fd) {
@@ -227,8 +227,8 @@ TEST_P(ProcSelfUidGidMapTest, TrailingNewlineAndNULIgnored) {
 
 TEST_P(ProcSelfUidGidMapTest, NonIdentityMapOwnID) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
-  uint32 id = CurrentID();
-  uint32 id2 = another_id(id);
+  uint32_t id = CurrentID();
+  uint32_t id2 = another_id(id);
   std::string line = absl::StrCat(id2, " ", id, " 1");
   EXPECT_THAT(
       InNewUserNamespaceWithMapFD([&](int fd) {
@@ -243,8 +243,8 @@ TEST_P(ProcSelfUidGidMapTest, MapOtherID) {
   // Whether or not we have CAP_SET*ID is irrelevant: the process running in the
   // new (child) user namespace won't have any capabilities in the current
   // (parent) user namespace, which is needed.
-  uint32 id = CurrentID();
-  uint32 id2 = another_id(id);
+  uint32_t id = CurrentID();
+  uint32_t id2 = another_id(id);
   std::string line = absl::StrCat(id, " ", id2, " 1");
   EXPECT_THAT(InNewUserNamespaceWithMapFD([&](int fd) {
                 DenySelfSetgroups();
@@ -270,8 +270,8 @@ TEST_P(ProcPidUidGidMapTest, MapOtherIDPrivileged) {
   std::tie(child_pid, cleanup_child) =
       ASSERT_NO_ERRNO_AND_VALUE(CreateProcessInNewUserNamespace());
 
-  uint32 id = CurrentID();
-  uint32 id2 = another_id(id);
+  uint32_t id = CurrentID();
+  uint32_t id2 = another_id(id);
   std::string line = absl::StrCat(id, " ", id2, " 1");
   DenyPidSetgroups(child_pid);
   auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenMapFile(child_pid));
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index 37dabb1ad..8f3800380 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -574,7 +574,7 @@ TEST_P(PtraceExecveTest, Execve_GetRegs_PeekUser_SIGKILL_TraceClone_TraceExit) {
 #ifdef __x86_64__
   {
     // CS should be 0x33, indicating an 64-bit binary.
-    constexpr uint64 kAMD64UserCS = 0x33;
+    constexpr uint64_t kAMD64UserCS = 0x33;
     EXPECT_THAT(ptrace(PTRACE_PEEKUSER, leader_tid,
                        offsetof(struct user_regs_struct, cs), 0),
                 SyscallSucceedsWithValue(kAMD64UserCS));
@@ -862,7 +862,7 @@ TEST(PtraceTest, Int3) {
 
 TEST(PtraceTest, Sysemu_PokeUser) {
   constexpr int kSysemuHelperFirstExitCode = 126;
-  constexpr uint64 kSysemuInjectedExitGroupReturn = 42;
+  constexpr uint64_t kSysemuInjectedExitGroupReturn = 42;
 
   pid_t const child_pid = fork();
   if (child_pid == 0) {
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
index 5020372c1..dafe64d20 100644
--- a/test/syscalls/linux/pty.cc
+++ b/test/syscalls/linux/pty.cc
@@ -109,13 +109,13 @@ constexpr bool IsControlCharacter(char c) { return c <= 31; }
 
 struct Field {
   const char* name;
-  uint64 mask;
-  uint64 value;
+  uint64_t mask;
+  uint64_t value;
 };
 
 // ParseFields returns a string representation of value, using the names in
 // fields.
-std::string ParseFields(const Field* fields, size_t len, uint64 value) {
+std::string ParseFields(const Field* fields, size_t len, uint64_t value) {
   bool first = true;
   std::string s;
   for (size_t i = 0; i < len; i++) {
@@ -1213,8 +1213,8 @@ TEST_F(PtyTest, GetWindowSize) {
 }
 
 TEST_F(PtyTest, SetSlaveWindowSize) {
-  constexpr uint16 kRows = 343;
-  constexpr uint16 kCols = 2401;
+  constexpr uint16_t kRows = 343;
+  constexpr uint16_t kCols = 2401;
   struct winsize ws = {.ws_row = kRows, .ws_col = kCols};
   ASSERT_THAT(ioctl(slave_.get(), TIOCSWINSZ, &ws), SyscallSucceeds());
 
@@ -1226,8 +1226,8 @@ TEST_F(PtyTest, SetSlaveWindowSize) {
 }
 
 TEST_F(PtyTest, SetMasterWindowSize) {
-  constexpr uint16 kRows = 343;
-  constexpr uint16 kCols = 2401;
+  constexpr uint16_t kRows = 343;
+  constexpr uint16_t kCols = 2401;
   struct winsize ws = {.ws_row = kRows, .ws_col = kCols};
   ASSERT_THAT(ioctl(master_.get(), TIOCSWINSZ, &ws), SyscallSucceeds());
 
diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc
index 18f847929..b48fe540d 100644
--- a/test/syscalls/linux/pwrite64.cc
+++ b/test/syscalls/linux/pwrite64.cc
@@ -52,7 +52,7 @@ class Pwrite64 : public ::testing::Test {
 TEST_F(Pwrite64, AppendOnly) {
   int fd;
   ASSERT_THAT(fd = open(name_.c_str(), O_APPEND | O_RDWR), SyscallSucceeds());
-  constexpr int64 kBufSize = 1024;
+  constexpr int64_t kBufSize = 1024;
   std::vector<char> buf(kBufSize);
   std::fill(buf.begin(), buf.end(), 'a');
   EXPECT_THAT(PwriteFd(fd, buf.data(), buf.size(), 0),
@@ -64,7 +64,7 @@ TEST_F(Pwrite64, AppendOnly) {
 TEST_F(Pwrite64, InvalidArgs) {
   int fd;
   ASSERT_THAT(fd = open(name_.c_str(), O_APPEND | O_RDWR), SyscallSucceeds());
-  constexpr int64 kBufSize = 1024;
+  constexpr int64_t kBufSize = 1024;
   std::vector<char> buf(kBufSize);
   std::fill(buf.begin(), buf.end(), 'a');
   EXPECT_THAT(PwriteFd(fd, buf.data(), buf.size(), -1),
diff --git a/test/syscalls/linux/raw_socket_hdrincl.cc b/test/syscalls/linux/raw_socket_hdrincl.cc
index 0c04b974e..0a27506aa 100644
--- a/test/syscalls/linux/raw_socket_hdrincl.cc
+++ b/test/syscalls/linux/raw_socket_hdrincl.cc
@@ -53,7 +53,7 @@ class RawHDRINCL : public ::testing::Test {
   // Fills in buf with an IP header, UDP header, and payload. Returns false if
   // buf_size isn't large enough to hold everything.
   bool FillPacket(char* buf, size_t buf_size, int port, const char* payload,
-                  uint16 payload_size);
+                  uint16_t payload_size);
 
   // The socket used for both reading and writing.
   int socket_;
@@ -104,7 +104,7 @@ struct iphdr RawHDRINCL::LoopbackHeader() {
 }
 
 bool RawHDRINCL::FillPacket(char* buf, size_t buf_size, int port,
-                            const char* payload, uint16 payload_size) {
+                            const char* payload, uint16_t payload_size) {
   if (buf_size < sizeof(struct iphdr) + sizeof(struct udphdr) + payload_size) {
     return false;
   }
diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc
index 9b2a76b91..106c045e3 100644
--- a/test/syscalls/linux/rseq.cc
+++ b/test/syscalls/linux/rseq.cc
@@ -43,7 +43,7 @@ namespace {
 // only be cleared by execve (or knowing the old rseq address), and glibc (based
 // on the current unmerged patches) register rseq before calling main()).
 
-int RSeq(struct rseq* rseq, uint32 rseq_len, int flags, uint32 sig) {
+int RSeq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
   return syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
 }
 
diff --git a/test/syscalls/linux/rseq/critical.h b/test/syscalls/linux/rseq/critical.h
index 238143fd0..ac987a25e 100644
--- a/test/syscalls/linux/rseq/critical.h
+++ b/test/syscalls/linux/rseq/critical.h
@@ -18,7 +18,7 @@
 #include "test/syscalls/linux/rseq/types.h"
 #include "test/syscalls/linux/rseq/uapi.h"
 
-constexpr uint32 kRseqSignature = 0x90909090;
+constexpr uint32_t kRseqSignature = 0x90909090;
 
 extern "C" {
 
diff --git a/test/syscalls/linux/rseq/rseq.cc b/test/syscalls/linux/rseq/rseq.cc
index 4fe7c5ecf..f036db26d 100644
--- a/test/syscalls/linux/rseq/rseq.cc
+++ b/test/syscalls/linux/rseq/rseq.cc
@@ -49,7 +49,7 @@ int strcmp(const char* s1, const char* s2) {
   return static_cast<int>(*p1) - static_cast<int>(*p2);
 }
 
-int sys_rseq(struct rseq* rseq, uint32 rseq_len, int flags, uint32 sig) {
+int sys_rseq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
   return raw_syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
 }
 
@@ -176,10 +176,10 @@ int TestAbort() {
   struct rseq_cs cs = {};
   cs.version = 0;
   cs.flags = 0;
-  cs.start_ip = reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.post_commit_offset = reinterpret_cast<uint64>(&rseq_loop_post_commit) -
-                          reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.abort_ip = reinterpret_cast<uint64>(&rseq_loop_abort);
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
 
   // Loops until abort. If this returns then abort occurred.
   rseq_loop(&r, &cs);
@@ -198,10 +198,10 @@ int TestAbortBefore() {
   struct rseq_cs cs = {};
   cs.version = 0;
   cs.flags = 0;
-  cs.start_ip = reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.post_commit_offset = reinterpret_cast<uint64>(&rseq_loop_post_commit) -
-                          reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.abort_ip = reinterpret_cast<uint64>(&rseq_loop_early_abort);
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_early_abort);
 
   // Loops until abort. If this returns then abort occurred.
   rseq_loop(&r, &cs);
@@ -220,10 +220,10 @@ int TestAbortSignature() {
   struct rseq_cs cs = {};
   cs.version = 0;
   cs.flags = 0;
-  cs.start_ip = reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.post_commit_offset = reinterpret_cast<uint64>(&rseq_loop_post_commit) -
-                          reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.abort_ip = reinterpret_cast<uint64>(&rseq_loop_abort);
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
 
   // Loops until abort. This should SIGSEGV on abort.
   rseq_loop(&r, &cs);
@@ -242,10 +242,10 @@ int TestAbortPreCommit() {
   struct rseq_cs cs = {};
   cs.version = 0;
   cs.flags = 0;
-  cs.start_ip = reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.post_commit_offset = reinterpret_cast<uint64>(&rseq_loop_post_commit) -
-                          reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.abort_ip = reinterpret_cast<uint64>(&rseq_loop_pre_commit);
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_pre_commit);
 
   // Loops until abort. This should SIGSEGV on abort.
   rseq_loop(&r, &cs);
@@ -264,10 +264,10 @@ int TestAbortClearsCS() {
   struct rseq_cs cs = {};
   cs.version = 0;
   cs.flags = 0;
-  cs.start_ip = reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.post_commit_offset = reinterpret_cast<uint64>(&rseq_loop_post_commit) -
-                          reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.abort_ip = reinterpret_cast<uint64>(&rseq_loop_abort);
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
 
   // Loops until abort. If this returns then abort occurred.
   rseq_loop(&r, &cs);
@@ -290,10 +290,10 @@ int TestInvalidAbortClearsCS() {
   struct rseq_cs cs = {};
   cs.version = 0;
   cs.flags = 0;
-  cs.start_ip = reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.post_commit_offset = reinterpret_cast<uint64>(&rseq_loop_post_commit) -
-                          reinterpret_cast<uint64>(&rseq_loop_start);
-  cs.abort_ip = reinterpret_cast<uint64>(&rseq_loop_abort);
+  cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+                          reinterpret_cast<uint64_t>(&rseq_loop_start);
+  cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
 
   __atomic_store_n(&r.rseq_cs, &cs, __ATOMIC_RELAXED);
 
diff --git a/test/syscalls/linux/rseq/types.h b/test/syscalls/linux/rseq/types.h
index 7f1e0c5c2..b6afe9817 100644
--- a/test/syscalls/linux/rseq/types.h
+++ b/test/syscalls/linux/rseq/types.h
@@ -18,14 +18,14 @@
 using size_t = __SIZE_TYPE__;
 using uintptr_t = __UINTPTR_TYPE__;
 
-using uint8 = __UINT8_TYPE__;
-using uint16 = __UINT16_TYPE__;
-using uint32 = __UINT32_TYPE__;
-using uint64 = __UINT64_TYPE__;
+using uint8_t = __UINT8_TYPE__;
+using uint16_t = __UINT16_TYPE__;
+using uint32_t = __UINT32_TYPE__;
+using uint64_t = __UINT64_TYPE__;
 
-using int8 = __INT8_TYPE__;
-using int16 = __INT16_TYPE__;
-using int32 = __INT32_TYPE__;
-using int64 = __INT64_TYPE__;
+using int8_t = __INT8_TYPE__;
+using int16_t = __INT16_TYPE__;
+using int32_t = __INT32_TYPE__;
+using int64_t = __INT64_TYPE__;
 
 #endif  // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 7a2c1191a..7e41fe7d8 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -49,12 +49,12 @@ namespace testing {
 namespace {
 
 // A syscall not implemented by Linux that we don't expect to be called.
-constexpr uint32 kFilteredSyscall = SYS_vserver;
+constexpr uint32_t kFilteredSyscall = SYS_vserver;
 
 // Applies a seccomp-bpf filter that returns `filtered_result` for
 // `sysno` and allows all other syscalls. Async-signal-safe.
-void ApplySeccompFilter(uint32 sysno, uint32 filtered_result,
-                        uint32 flags = 0) {
+void ApplySeccompFilter(uint32_t sysno, uint32_t filtered_result,
+                        uint32_t flags = 0) {
   // "Prior to [PR_SET_SECCOMP], the task must call prctl(PR_SET_NO_NEW_PRIVS,
   // 1) or run with CAP_SYS_ADMIN privileges in its namespace." -
   // Documentation/prctl/seccomp_filter.txt
@@ -162,7 +162,7 @@ TEST(SeccompTest, RetKillOnlyKillsOneThread) {
 TEST(SeccompTest, RetTrapCausesSIGSYS) {
   pid_t const pid = fork();
   if (pid == 0) {
-    constexpr uint16 kTrapValue = 0xdead;
+    constexpr uint16_t kTrapValue = 0xdead;
     RegisterSignalHandler(
         SIGSYS, +[](int signo, siginfo_t* info, void* ucv) {
           ucontext_t* uc = static_cast<ucontext_t*>(ucv);
@@ -191,7 +191,7 @@ TEST(SeccompTest, RetTrapCausesSIGSYS) {
 
 #ifdef __x86_64__
 
-constexpr uint64 kVsyscallTimeEntry = 0xffffffffff600400;
+constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
 
 time_t vsyscall_time(time_t* t) {
   return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
@@ -202,7 +202,7 @@ TEST(SeccompTest, SeccompAppliesToVsyscall) {
 
   pid_t const pid = fork();
   if (pid == 0) {
-    constexpr uint16 kTrapValue = 0xdead;
+    constexpr uint16_t kTrapValue = 0xdead;
     RegisterSignalHandler(
         SIGSYS, +[](int signo, siginfo_t* info, void* ucv) {
           ucontext_t* uc = static_cast<ucontext_t*>(ucv);
@@ -335,7 +335,7 @@ TEST(SeccompTest, TsyncAppliesToAllThreads) {
 
 // This test will validate that seccomp(2) rejects unsupported flags.
 TEST(SeccompTest, SeccompRejectsUnknownFlags) {
-  constexpr uint32 kInvalidFlag = 123;
+  constexpr uint32_t kInvalidFlag = 123;
   ASSERT_THAT(
       syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, kInvalidFlag, nullptr),
       SyscallFailsWithErrno(EINVAL));
diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc
index a9e8a44c1..e9b131ca9 100644
--- a/test/syscalls/linux/semaphore.cc
+++ b/test/syscalls/linux/semaphore.cc
@@ -274,7 +274,7 @@ TEST(SemaphoreTest, SemOpRandom) {
 
   // Protects the seed below.
   absl::Mutex mutex;
-  uint32 seed = time(nullptr);
+  uint32_t seed = time(nullptr);
 
   int count = 0;      // Tracks semaphore value.
   bool done = false;  // Tells waiters to stop after signal threads are done.
@@ -284,7 +284,7 @@ TEST(SemaphoreTest, SemOpRandom) {
   for (auto& dec : decs) {
     dec = absl::make_unique<ScopedThread>([&sem, &mutex, &count, &seed, &done] {
       for (size_t i = 0; i < 500; ++i) {
-        int16 val;
+        int16_t val;
         {
           absl::MutexLock l(&mutex);
           if (done) {
@@ -325,7 +325,7 @@ TEST(SemaphoreTest, SemOpRandom) {
   for (auto& inc : incs) {
     inc = absl::make_unique<ScopedThread>([&sem, &mutex, &count, &seed] {
       for (size_t i = 0; i < 500; ++i) {
-        int16 val;
+        int16_t val;
         {
           absl::MutexLock l(&mutex);
           val = (rand_r(&seed) % 10 + 1);  // Rand between 1 and 10.
@@ -415,14 +415,14 @@ TEST(SemaphoreTest, SemCtlValAll) {
   ASSERT_THAT(sem.get(), SyscallSucceeds());
 
   // Semaphores must start with 0.
-  uint16 get[3] = {10, 10, 10};
+  uint16_t get[3] = {10, 10, 10};
   EXPECT_THAT(semctl(sem.get(), 1, GETALL, get), SyscallSucceedsWithValue(0));
   for (auto v : get) {
     EXPECT_EQ(v, 0);
   }
 
   // SetAll and check that they were set.
-  uint16 vals[3] = {0, 10, 20};
+  uint16_t vals[3] = {0, 10, 20};
   EXPECT_THAT(semctl(sem.get(), 1, SETALL, vals), SyscallSucceedsWithValue(0));
   EXPECT_THAT(semctl(sem.get(), 1, GETALL, get), SyscallSucceedsWithValue(0));
   for (size_t i = 0; i < ABSL_ARRAYSIZE(vals); ++i) {
diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc
index 80700615f..7ba752599 100644
--- a/test/syscalls/linux/shm.cc
+++ b/test/syscalls/linux/shm.cc
@@ -30,7 +30,7 @@ namespace {
 
 using ::testing::_;
 
-const uint64 kAllocSize = kPageSize * 128ULL;
+const uint64_t kAllocSize = kPageSize * 128ULL;
 
 PosixErrorOr<char*> Shmat(int shmid, const void* shmaddr, int shmflg) {
   const intptr_t addr =
@@ -320,11 +320,11 @@ TEST(ShmTest, RemovedSegmentsAreDestroyed) {
       Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
   const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
 
-  const uint64 alloc_pages = kAllocSize / kPageSize;
+  const uint64_t alloc_pages = kAllocSize / kPageSize;
 
   struct shm_info info;
   ASSERT_NO_ERRNO(Shmctl(0 /*ignored*/, SHM_INFO, &info));
-  const uint64 before = info.shm_tot;
+  const uint64_t before = info.shm_tot;
 
   ASSERT_NO_ERRNO(shm.Rmid());
   ASSERT_NO_ERRNO(Shmdt(addr));
@@ -400,7 +400,7 @@ TEST(ShmDeathTest, SegmentNotAccessibleAfterDetach) {
 TEST(ShmTest, RequestingSegmentSmallerThanSHMMINFails) {
   struct shminfo info;
   ASSERT_NO_ERRNO(Shmctl(0, IPC_INFO, &info));
-  const uint64 size = info.shmmin - 1;
+  const uint64_t size = info.shmmin - 1;
   EXPECT_THAT(Shmget(IPC_PRIVATE, size, IPC_CREAT | 0777),
               PosixErrorIs(EINVAL, _));
 }
@@ -408,7 +408,7 @@ TEST(ShmTest, RequestingSegmentSmallerThanSHMMINFails) {
 TEST(ShmTest, RequestingSegmentLargerThanSHMMAXFails) {
   struct shminfo info;
   ASSERT_NO_ERRNO(Shmctl(0, IPC_INFO, &info));
-  const uint64 size = info.shmmax + kPageSize;
+  const uint64_t size = info.shmmax + kPageSize;
   EXPECT_THAT(Shmget(IPC_PRIVATE, size, IPC_CREAT | 0777),
               PosixErrorIs(EINVAL, _));
 }
diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc
index 9a0816e10..62b04ef1d 100644
--- a/test/syscalls/linux/sigaltstack.cc
+++ b/test/syscalls/linux/sigaltstack.cc
@@ -114,7 +114,7 @@ TEST(SigaltstackTest, ResetByExecve) {
 
 volatile bool badhandler_on_sigaltstack = true;      // Set by the handler.
 char* volatile badhandler_low_water_mark = nullptr;  // Set by the handler.
-volatile uint8 badhandler_recursive_faults = 0;      // Consumed by the handler.
+volatile uint8_t badhandler_recursive_faults = 0;    // Consumed by the handler.
 
 void badhandler(int sig, siginfo_t* siginfo, void* arg) {
   char stack_var = 0;
diff --git a/test/syscalls/linux/sigiret.cc b/test/syscalls/linux/sigiret.cc
index 207506569..a47c781ea 100644
--- a/test/syscalls/linux/sigiret.cc
+++ b/test/syscalls/linux/sigiret.cc
@@ -28,8 +28,8 @@ namespace testing {
 
 namespace {
 
-constexpr uint64 kOrigRcx = 0xdeadbeeffacefeed;
-constexpr uint64 kOrigR11 = 0xfacefeedbaad1dea;
+constexpr uint64_t kOrigRcx = 0xdeadbeeffacefeed;
+constexpr uint64_t kOrigR11 = 0xfacefeedbaad1dea;
 
 volatile int gotvtalrm, ready;
 
@@ -40,8 +40,8 @@ void sigvtalrm(int sig, siginfo_t* siginfo, void* _uc) {
   // - test is in the busy-wait loop waiting for signal.
   // - %rcx and %r11 values in mcontext_t match kOrigRcx and kOrigR11.
   if (ready &&
-      static_cast<uint64>(uc->uc_mcontext.gregs[REG_RCX]) == kOrigRcx &&
-      static_cast<uint64>(uc->uc_mcontext.gregs[REG_R11]) == kOrigR11) {
+      static_cast<uint64_t>(uc->uc_mcontext.gregs[REG_RCX]) == kOrigRcx &&
+      static_cast<uint64_t>(uc->uc_mcontext.gregs[REG_R11]) == kOrigR11) {
     // Modify the values %rcx and %r11 in the ucontext. These are the
     // values seen by the application after the signal handler returns.
     uc->uc_mcontext.gregs[REG_RCX] = ~kOrigRcx;
@@ -69,8 +69,8 @@ TEST(SigIretTest, CheckRcxR11) {
       ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_VIRTUAL, itimer));
 
   // Initialize %rcx and %r11 and spin until the signal handler returns.
-  uint64 rcx = kOrigRcx;
-  uint64 r11 = kOrigR11;
+  uint64_t rcx = kOrigRcx;
+  uint64_t r11 = kOrigR11;
   asm volatile(
       "movq %[rcx], %%rcx;"                      // %rcx = rcx
       "movq %[r11], %%r11;"                      // %r11 = r11
@@ -91,7 +91,7 @@ TEST(SigIretTest, CheckRcxR11) {
   EXPECT_EQ(r11, ~kOrigR11);
 }
 
-constexpr uint64 kNonCanonicalRip = 0xCCCC000000000000;
+constexpr uint64_t kNonCanonicalRip = 0xCCCC000000000000;
 
 // Test that a non-canonical signal handler faults as expected.
 TEST(SigIretTest, BadHandler) {
diff --git a/test/syscalls/linux/socket_bind_to_device_distribution.cc b/test/syscalls/linux/socket_bind_to_device_distribution.cc
index c705da1b4..5ed57625c 100644
--- a/test/syscalls/linux/socket_bind_to_device_distribution.cc
+++ b/test/syscalls/linux/socket_bind_to_device_distribution.cc
@@ -77,13 +77,13 @@ class BindToDeviceDistributionTest
   }
 };
 
-PosixErrorOr<uint16> AddrPort(int family, sockaddr_storage const& addr) {
+PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) {
   switch (family) {
     case AF_INET:
-      return static_cast<uint16>(
+      return static_cast<uint16_t>(
           reinterpret_cast<sockaddr_in const*>(&addr)->sin_port);
     case AF_INET6:
-      return static_cast<uint16>(
+      return static_cast<uint16_t>(
           reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port);
     default:
       return PosixError(EINVAL,
@@ -91,7 +91,7 @@ PosixErrorOr<uint16> AddrPort(int family, sockaddr_storage const& addr) {
   }
 }
 
-PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16 port) {
+PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) {
   switch (family) {
     case AF_INET:
       reinterpret_cast<sockaddr_in*>(addr)->sin_port = port;
@@ -157,7 +157,7 @@ TEST_P(BindToDeviceDistributionTest, Tcp) {
         getsockname(listener_fds[0].get(),
                     reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
         SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
     ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
   }
@@ -190,7 +190,7 @@ TEST_P(BindToDeviceDistributionTest, Tcp) {
             // cause the test to use absurd amounts of memory.
             //
             // See: https://tools.ietf.org/html/rfc2525#page-50 section 2.17
-            uint16 data;
+            uint16_t data;
             EXPECT_THAT(
                 RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0),
                 SyscallSucceedsWithValue(sizeof(data)));
@@ -296,7 +296,7 @@ TEST_P(BindToDeviceDistributionTest, Udp) {
         getsockname(listener_fds[0].get(),
                     reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
         SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
     ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
     ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc
index ee9856f7f..e8f24a59e 100644
--- a/test/syscalls/linux/socket_generic.cc
+++ b/test/syscalls/linux/socket_generic.cc
@@ -507,7 +507,7 @@ TEST_P(AllSocketPairTest, SoRcvTimeoIsSetLargerArg) {
 
   struct timeval_with_extra {
     struct timeval tv;
-    int64 extra_data;
+    int64_t extra_data;
   } ABSL_ATTRIBUTE_PACKED;
 
   timeval_with_extra tv_extra;
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 12df2b35a..2f9821555 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -47,13 +47,13 @@ namespace {
 
 using ::testing::Gt;
 
-PosixErrorOr<uint16> AddrPort(int family, sockaddr_storage const& addr) {
+PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) {
   switch (family) {
     case AF_INET:
-      return static_cast<uint16>(
+      return static_cast<uint16_t>(
           reinterpret_cast<sockaddr_in const*>(&addr)->sin_port);
     case AF_INET6:
-      return static_cast<uint16>(
+      return static_cast<uint16_t>(
           reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port);
     default:
       return PosixError(EINVAL,
@@ -61,7 +61,7 @@ PosixErrorOr<uint16> AddrPort(int family, sockaddr_storage const& addr) {
   }
 }
 
-PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16 port) {
+PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) {
   switch (family) {
     case AF_INET:
       reinterpret_cast<sockaddr_in*>(addr)->sin_port = port;
@@ -276,7 +276,7 @@ void tcpSimpleConnectTest(TestAddress const& listener,
   ASSERT_THAT(getsockname(listen_fd.get(),
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   // Connect to the listening socket.
@@ -339,7 +339,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   ASSERT_THAT(getsockname(listen_fd.get(),
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   DisableSave ds;  // Too many system calls.
@@ -400,7 +400,7 @@ TEST_P(SocketInetLoopbackTest, TCPbacklog) {
   ASSERT_THAT(getsockname(listen_fd.get(),
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
   int i = 0;
   while (1) {
@@ -468,7 +468,7 @@ TEST_P(SocketInetLoopbackTest, TCPFinWait2Test_NoRandomSave) {
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
 
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   // Connect to the listening socket.
@@ -576,7 +576,7 @@ TEST_P(SocketInetLoopbackTest, TCPLinger2TimeoutAfterClose_NoRandomSave) {
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
 
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   // Connect to the listening socket.
@@ -650,7 +650,7 @@ TEST_P(SocketInetLoopbackTest, TCPResetAfterClose) {
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
 
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   // Connect to the listening socket.
@@ -717,7 +717,7 @@ TEST_P(SocketInetLoopbackTest, TCPTimeWaitTest_NoRandomSave) {
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
 
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   // Connect to the listening socket.
@@ -794,7 +794,7 @@ TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
                           reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
               SyscallSucceeds());
 
-  const uint16 port =
+  const uint16_t port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
   // Set the userTimeout on the listening socket.
@@ -898,7 +898,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
         getsockname(listener_fds[0].get(),
                     reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
         SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
     ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
     ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
@@ -935,7 +935,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
             // cause the test to use absurd amounts of memory.
             //
             // See: https://tools.ietf.org/html/rfc2525#page-50 section 2.17
-            uint16 data;
+            uint16_t data;
             EXPECT_THAT(
                 RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0),
                 SyscallSucceedsWithValue(sizeof(data)));
@@ -1022,7 +1022,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) {
         getsockname(listener_fds[0].get(),
                     reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
         SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
     ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
     ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
@@ -1138,7 +1138,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
         getsockname(listener_fds[0].get(),
                     reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
         SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
     ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
     ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
@@ -1174,7 +1174,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
     pollfds[i].events = POLLIN;
   }
 
-  std::map<uint16, int> portToFD;
+  std::map<uint16_t, int> portToFD;
 
   int received = 0;
   while (received < kConnectAttempts * 2) {
@@ -1196,7 +1196,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
                       fd, &data, sizeof(data), 0,
                       reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
                   SyscallSucceedsWithValue(sizeof(data)));
-      uint16 const port =
+      uint16_t const port =
           ASSERT_NO_ERRNO_AND_VALUE(AddrPort(connector.family(), addr));
       auto prev_port = portToFD.find(port);
       // Check that all packets from one client have been delivered to the
@@ -1257,7 +1257,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedLoopbackOnlyReservesV4) {
     ASSERT_THAT(getsockname(fd_dual.get(),
                             reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
                 SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
 
     // Verify that we can still bind the v6 loopback on the same port.
@@ -1309,7 +1309,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedAnyOnlyReservesV4) {
     ASSERT_THAT(getsockname(fd_dual.get(),
                             reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
                 SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
 
     // Verify that we can still bind the v6 loopback on the same port.
@@ -1360,7 +1360,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, DualStackV6AnyReservesEverything) {
   ASSERT_THAT(getsockname(fd_dual.get(),
                           reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
               SyscallSucceeds());
-  uint16 const port =
+  uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
 
   // Verify that binding the v6 loopback with the same port fails.
@@ -1419,7 +1419,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6OnlyV6AnyReservesV6) {
     ASSERT_THAT(getsockname(fd_dual.get(),
                             reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
                 SyscallSucceeds());
-    uint16 const port =
+    uint16_t const port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
 
     // Verify that binding the v6 loopback with the same port fails.
@@ -1498,7 +1498,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) {
                             reinterpret_cast<sockaddr*>(&connected_addr),
                             &connected_addr_len),
                 SyscallSucceeds());
-    uint16 const ephemeral_port =
+    uint16_t const ephemeral_port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
 
     // Verify that we actually got an ephemeral port.
@@ -1603,7 +1603,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReservedReuseAddr) {
                           reinterpret_cast<sockaddr*>(&connected_addr),
                           &connected_addr_len),
               SyscallSucceeds());
-  uint16 const ephemeral_port =
+  uint16_t const ephemeral_port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
 
   // Verify that we actually got an ephemeral port.
@@ -1665,7 +1665,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) {
                             reinterpret_cast<sockaddr*>(&connected_addr),
                             &connected_addr_len),
                 SyscallSucceeds());
-    uint16 const ephemeral_port =
+    uint16_t const ephemeral_port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
 
     // Verify that we actually got an ephemeral port.
@@ -1794,7 +1794,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest,
                           reinterpret_cast<sockaddr*>(&connected_addr),
                           &connected_addr_len),
               SyscallSucceeds());
-  uint16 const ephemeral_port =
+  uint16_t const ephemeral_port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
 
   // Verify that we actually got an ephemeral port.
@@ -1856,7 +1856,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) {
                             reinterpret_cast<sockaddr*>(&connected_addr),
                             &connected_addr_len),
                 SyscallSucceeds());
-    uint16 const ephemeral_port =
+    uint16_t const ephemeral_port =
         ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
 
     // Verify that we actually got an ephemeral port.
@@ -1988,7 +1988,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReservedReuseAddr) {
                           reinterpret_cast<sockaddr*>(&connected_addr),
                           &connected_addr_len),
               SyscallSucceeds());
-  uint16 const ephemeral_port =
+  uint16_t const ephemeral_port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
 
   // Verify that we actually got an ephemeral port.
diff --git a/test/syscalls/linux/socket_ip_unbound.cc b/test/syscalls/linux/socket_ip_unbound.cc
index 4a8337159..ca597e267 100644
--- a/test/syscalls/linux/socket_ip_unbound.cc
+++ b/test/syscalls/linux/socket_ip_unbound.cc
@@ -223,7 +223,7 @@ TEST_P(IPUnboundSocketTest, CheckSkipECN) {
   TOSOption t = GetTOSOption(GetParam().domain);
   EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
               SyscallSucceedsWithValue(0));
-  int expect = static_cast<uint8>(set);
+  int expect = static_cast<uint8_t>(set);
   if (GetParam().protocol == IPPROTO_TCP) {
     expect &= ~INET_ECN_MASK;
   }
@@ -267,7 +267,7 @@ TEST_P(IPUnboundSocketTest, SmallTOSOptionSize) {
       EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, i),
                   SyscallSucceedsWithValue(0));
       expect_tos = set;
-      expect_sz = sizeof(uint8);
+      expect_sz = sizeof(uint8_t);
     } else {
       EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, i),
                   SyscallFailsWithErrno(EINVAL));
@@ -314,7 +314,7 @@ TEST_P(IPUnboundSocketTest, NegativeTOS) {
               SyscallSucceedsWithValue(0));
   int expect;
   if (GetParam().domain == AF_INET) {
-    expect = static_cast<uint8>(set);
+    expect = static_cast<uint8_t>(set);
     if (GetParam().protocol == IPPROTO_TCP) {
       expect &= ~INET_ECN_MASK;
     }
@@ -340,7 +340,7 @@ TEST_P(IPUnboundSocketTest, InvalidNegativeTOS) {
   if (GetParam().domain == AF_INET) {
     EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
                 SyscallSucceedsWithValue(0));
-    expect = static_cast<uint8>(set);
+    expect = static_cast<uint8_t>(set);
     if (GetParam().protocol == IPPROTO_TCP) {
       expect &= ~INET_ECN_MASK;
     }
diff --git a/test/syscalls/linux/socket_netdevice.cc b/test/syscalls/linux/socket_netdevice.cc
index 689014a59..405dbbd73 100644
--- a/test/syscalls/linux/socket_netdevice.cc
+++ b/test/syscalls/linux/socket_netdevice.cc
@@ -70,14 +70,14 @@ TEST(NetdeviceTest, Netmask) {
   // netmask obtained via ioctl.
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
-  uint32 port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+  uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
     struct nlmsghdr hdr;
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
@@ -109,7 +109,7 @@ TEST(NetdeviceTest, Netmask) {
 
         struct ifaddrmsg *ifaddrmsg =
             reinterpret_cast<struct ifaddrmsg *>(NLMSG_DATA(hdr));
-        if (ifaddrmsg->ifa_index == static_cast<uint32>(ifr.ifr_ifindex) &&
+        if (ifaddrmsg->ifa_index == static_cast<uint32_t>(ifr.ifr_ifindex) &&
             ifaddrmsg->ifa_family == AF_INET) {
           prefixlen = ifaddrmsg->ifa_prefixlen;
         }
@@ -120,7 +120,7 @@ TEST(NetdeviceTest, Netmask) {
 
   // Netmask is stored big endian in struct sockaddr_in, so we do the same for
   // comparison.
-  uint32 mask = 0xffffffff << (32 - prefixlen);
+  uint32_t mask = 0xffffffff << (32 - prefixlen);
   mask = absl::gbswap_32(mask);
 
   // Check that the loopback interface has the correct subnet mask.
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index 5612f1a13..ef567f512 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -116,14 +116,14 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
 TEST(NetlinkRouteTest, GetLinkDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
-  uint32 port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+  uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
     struct nlmsghdr hdr;
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -164,7 +164,7 @@ TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -198,7 +198,7 @@ TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -238,7 +238,7 @@ TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -274,7 +274,7 @@ TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
 TEST(NetlinkRouteTest, ControlMessageIgnored) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
-  uint32 port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+  uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
     struct nlmsghdr control_hdr;
@@ -282,7 +282,7 @@ TEST(NetlinkRouteTest, ControlMessageIgnored) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req = {};
 
@@ -310,14 +310,14 @@ TEST(NetlinkRouteTest, ControlMessageIgnored) {
 TEST(NetlinkRouteTest, GetAddrDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
-  uint32 port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+  uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
     struct nlmsghdr hdr;
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
@@ -371,14 +371,14 @@ TEST(NetlinkRouteTest, LookupAll) {
 TEST(NetlinkRouteTest, GetRouteDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
-  uint32 port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+  uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
   struct request {
     struct nlmsghdr hdr;
     struct rtmsg rtm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -454,7 +454,7 @@ TEST(NetlinkRouteTest, RecvmsgTrunc) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
@@ -531,7 +531,7 @@ TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
@@ -611,7 +611,7 @@ TEST(NetlinkRouteTest, NoPasscredNoCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
@@ -659,7 +659,7 @@ TEST(NetlinkRouteTest, PasscredCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32 kSeq = 12345;
+  constexpr uint32_t kSeq = 12345;
 
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index 17f99c238..723f5d728 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -40,7 +40,7 @@ PosixErrorOr<FileDescriptor> NetlinkBoundSocket(int protocol) {
   return std::move(fd);
 }
 
-PosixErrorOr<uint32> NetlinkPortID(int fd) {
+PosixErrorOr<uint32_t> NetlinkPortID(int fd) {
   struct sockaddr_nl addr;
   socklen_t addrlen = sizeof(addr);
 
@@ -48,7 +48,7 @@ PosixErrorOr<uint32> NetlinkPortID(int fd) {
       getsockname(fd, reinterpret_cast<struct sockaddr*>(&addr), &addrlen));
   MaybeSave();
 
-  return static_cast<uint32>(addr.nl_pid);
+  return static_cast<uint32_t>(addr.nl_pid);
 }
 
 PosixError NetlinkRequestResponse(
diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index bd0c1d79b..76e772c48 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -30,7 +30,7 @@ namespace testing {
 PosixErrorOr<FileDescriptor> NetlinkBoundSocket(int protocol);
 
 // Returns the port ID of the passed socket.
-PosixErrorOr<uint32> NetlinkPortID(int fd);
+PosixErrorOr<uint32_t> NetlinkPortID(int fd);
 
 // Send the passed request and call fn will all response netlink messages.
 PosixError NetlinkRequestResponse(
diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
index 2169ff1c6..eff7d577e 100644
--- a/test/syscalls/linux/socket_test_util.cc
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -507,7 +507,7 @@ void TransferTest(int fd1, int fd2) {
 
 // Initializes the given buffer with random data.
 void RandomizeBuffer(char* ptr, size_t len) {
-  uint32 seed = time(nullptr);
+  uint32_t seed = time(nullptr);
   for (size_t i = 0; i < len; ++i) {
     ptr[i] = static_cast<char>(rand_r(&seed));
   }
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index 562b6a8d4..85232cb1f 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -139,7 +139,7 @@ TEST(SpliceTest, PipeOffsets) {
 // Event FDs may be used with splice without an offset.
 TEST(SpliceTest, FromEventFD) {
   // Open the input eventfd with an initial value so that it is readable.
-  constexpr uint64 kEventFDValue = 1;
+  constexpr uint64_t kEventFDValue = 1;
   int efd;
   ASSERT_THAT(efd = eventfd(kEventFDValue, 0), SyscallSucceeds());
   const FileDescriptor in_fd(efd);
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index 6b259cb89..30de2f8ff 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -568,35 +568,35 @@ TEST(SimpleStatTest, AnonDeviceAllocatesUniqueInodesAcrossSaveRestore) {
 
 // struct kernel_statx_timestamp is a Linux statx_timestamp struct.
 struct kernel_statx_timestamp {
-  int64 tv_sec;
-  uint32 tv_nsec;
-  int32 __reserved;
+  int64_t tv_sec;
+  uint32_t tv_nsec;
+  int32_t __reserved;
 };
 
 // struct kernel_statx is a Linux statx struct. Old versions of glibc do not
 // expose it. See include/uapi/linux/stat.h
 struct kernel_statx {
-  uint32 stx_mask;
-  uint32 stx_blksize;
-  uint64 stx_attributes;
-  uint32 stx_nlink;
-  uint32 stx_uid;
-  uint32 stx_gid;
-  uint16 stx_mode;
-  uint16 __spare0[1];
-  uint64 stx_ino;
-  uint64 stx_size;
-  uint64 stx_blocks;
-  uint64 stx_attributes_mask;
+  uint32_t stx_mask;
+  uint32_t stx_blksize;
+  uint64_t stx_attributes;
+  uint32_t stx_nlink;
+  uint32_t stx_uid;
+  uint32_t stx_gid;
+  uint16_t stx_mode;
+  uint16_t __spare0[1];
+  uint64_t stx_ino;
+  uint64_t stx_size;
+  uint64_t stx_blocks;
+  uint64_t stx_attributes_mask;
   struct kernel_statx_timestamp stx_atime;
   struct kernel_statx_timestamp stx_btime;
   struct kernel_statx_timestamp stx_ctime;
   struct kernel_statx_timestamp stx_mtime;
-  uint32 stx_rdev_major;
-  uint32 stx_rdev_minor;
-  uint32 stx_dev_major;
-  uint32 stx_dev_minor;
-  uint64 __spare2[14];
+  uint32_t stx_rdev_major;
+  uint32_t stx_rdev_minor;
+  uint32_t stx_dev_major;
+  uint32_t stx_dev_minor;
+  uint64_t __spare2[14];
 };
 
 int statx(int dirfd, const char *pathname, int flags, unsigned int mask,
diff --git a/test/syscalls/linux/sticky.cc b/test/syscalls/linux/sticky.cc
index abcabaffb..7e73325bf 100644
--- a/test/syscalls/linux/sticky.cc
+++ b/test/syscalls/linux/sticky.cc
@@ -29,8 +29,8 @@
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
-ABSL_FLAG(int32, scratch_uid, 65534, "first scratch UID");
-ABSL_FLAG(int32, scratch_gid, 65534, "first scratch GID");
+ABSL_FLAG(int32_t, scratch_uid, 65534, "first scratch UID");
+ABSL_FLAG(int32_t, scratch_gid, 65534, "first scratch GID");
 
 namespace gvisor {
 namespace testing {
diff --git a/test/syscalls/linux/sysret.cc b/test/syscalls/linux/sysret.cc
index d98d6be91..819fa655a 100644
--- a/test/syscalls/linux/sysret.cc
+++ b/test/syscalls/linux/sysret.cc
@@ -26,8 +26,8 @@ namespace testing {
 
 namespace {
 
-constexpr uint64 kNonCanonicalRip = 0xCCCC000000000000;
-constexpr uint64 kNonCanonicalRsp = 0xFFFF000000000000;
+constexpr uint64_t kNonCanonicalRip = 0xCCCC000000000000;
+constexpr uint64_t kNonCanonicalRsp = 0xFFFF000000000000;
 
 class SysretTest : public ::testing::Test {
  protected:
@@ -60,12 +60,12 @@ class SysretTest : public ::testing::Test {
     ASSERT_THAT(ptrace(PTRACE_DETACH, child_, 0, 0), SyscallSucceeds());
   }
 
-  void SetRip(uint64 newrip) {
+  void SetRip(uint64_t newrip) {
     regs_.rip = newrip;
     ASSERT_THAT(ptrace(PTRACE_SETREGS, child_, 0, &regs_), SyscallSucceeds());
   }
 
-  void SetRsp(uint64 newrsp) {
+  void SetRsp(uint64_t newrsp) {
     regs_.rsp = newrsp;
     ASSERT_THAT(ptrace(PTRACE_SETREGS, child_, 0, &regs_), SyscallSucceeds());
   }
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index cb304d6f5..33a5ac66c 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -640,7 +640,7 @@ TEST_P(TcpSocketTest, Tiocinq) {
   size_t size = sizeof(buf);
   ASSERT_THAT(RetryEINTR(write)(s_, buf, size), SyscallSucceedsWithValue(size));
 
-  uint32 seed = time(nullptr);
+  uint32_t seed = time(nullptr);
   const size_t max_chunk = size / 10;
   while (size > 0) {
     size_t chunk = (rand_r(&seed) % max_chunk) + 1;
diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc
index 03e028f50..c7eead17e 100644
--- a/test/syscalls/linux/time.cc
+++ b/test/syscalls/linux/time.cc
@@ -28,7 +28,7 @@ constexpr long kFudgeSeconds = 5;
 
 // Mimics the time(2) wrapper from glibc prior to 2.15.
 time_t vsyscall_time(time_t* t) {
-  constexpr uint64 kVsyscallTimeEntry = 0xffffffffff600400;
+  constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
   return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
 }
 
@@ -63,7 +63,7 @@ TEST(TimeTest, VsyscallTime_InvalidAddressSIGSEGV) {
 }
 
 int vsyscall_gettimeofday(struct timeval* tv, struct timezone* tz) {
-  constexpr uint64 kVsyscallGettimeofdayEntry = 0xffffffffff600000;
+  constexpr uint64_t kVsyscallGettimeofdayEntry = 0xffffffffff600000;
   return reinterpret_cast<int (*)(struct timeval*, struct timezone*)>(
       kVsyscallGettimeofdayEntry)(tv, tz);
 }
diff --git a/test/syscalls/linux/timerfd.cc b/test/syscalls/linux/timerfd.cc
index d87dbc666..86ed87b7c 100644
--- a/test/syscalls/linux/timerfd.cc
+++ b/test/syscalls/linux/timerfd.cc
@@ -69,9 +69,9 @@ TEST_P(TimerfdTest, SingleShot) {
 
   // The timer should fire exactly once since the interval is zero.
   absl::SleepFor(kDelay + TimerSlack());
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
-              SyscallSucceedsWithValue(sizeof(uint64)));
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+              SyscallSucceedsWithValue(sizeof(uint64_t)));
   EXPECT_EQ(1, val);
 }
 
@@ -89,9 +89,9 @@ TEST_P(TimerfdTest, Periodic) {
   // Expect to see at least kPeriods expirations. More may occur due to the
   // timer slack, or due to delays from scheduling or save/restore.
   absl::SleepFor(kPeriods * kDelay + TimerSlack());
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
-              SyscallSucceedsWithValue(sizeof(uint64)));
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+              SyscallSucceedsWithValue(sizeof(uint64_t)));
   EXPECT_GE(val, kPeriods);
 }
 
@@ -106,9 +106,9 @@ TEST_P(TimerfdTest, BlockingRead) {
               SyscallSucceeds());
 
   // read should block until the timer fires.
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
-              SyscallSucceedsWithValue(sizeof(uint64)));
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+              SyscallSucceedsWithValue(sizeof(uint64_t)));
   auto const end_time = absl::Now();
   EXPECT_EQ(1, val);
   EXPECT_GE((end_time - start_time) + TimerSlack(), kDelay);
@@ -122,8 +122,8 @@ TEST_P(TimerfdTest, NonblockingRead_NoRandomSave) {
 
   // Since the timer is initially disabled and has never fired, read should
   // return EAGAIN.
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
               SyscallFailsWithErrno(EAGAIN));
 
   DisableSave ds;  // Timing-sensitive.
@@ -135,19 +135,19 @@ TEST_P(TimerfdTest, NonblockingRead_NoRandomSave) {
               SyscallSucceeds());
 
   // Since the timer has not yet fired, read should return EAGAIN.
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
               SyscallFailsWithErrno(EAGAIN));
 
   ds.reset();  // No longer timing-sensitive.
 
   // After the timer fires, read should indicate 1 expiration.
   absl::SleepFor(kDelay + TimerSlack());
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
-              SyscallSucceedsWithValue(sizeof(uint64)));
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+              SyscallSucceedsWithValue(sizeof(uint64_t)));
   EXPECT_EQ(1, val);
 
   // The successful read should have reset the number of expirations.
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
               SyscallFailsWithErrno(EAGAIN));
 }
 
@@ -179,8 +179,8 @@ TEST_P(TimerfdTest, BlockingPoll_SetTimeResetsExpirations) {
   its.it_value.tv_sec = 0;
   ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr),
               SyscallSucceeds());
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
               SyscallFailsWithErrno(EAGAIN));
 }
 
@@ -198,16 +198,16 @@ TEST_P(TimerfdTest, SetAbsoluteTime) {
               SyscallSucceeds());
 
   absl::SleepFor(kDelay + TimerSlack());
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
-              SyscallSucceedsWithValue(sizeof(uint64)));
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+              SyscallSucceedsWithValue(sizeof(uint64_t)));
   EXPECT_EQ(1, val);
 }
 
 TEST_P(TimerfdTest, IllegalReadWrite) {
   auto const tfd =
       ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(GetParam(), TFD_NONBLOCK));
-  uint64 val = 0;
+  uint64_t val = 0;
   EXPECT_THAT(PreadFd(tfd.get(), &val, sizeof(val), 0),
               SyscallFailsWithErrno(ESPIPE));
   EXPECT_THAT(WriteFd(tfd.get(), &val, sizeof(val)),
@@ -244,9 +244,9 @@ TEST(TimerfdClockRealtimeTest, ClockRealtime) {
   ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr),
               SyscallSucceeds());
 
-  uint64 val = 0;
-  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64)),
-              SyscallSucceedsWithValue(sizeof(uint64)));
+  uint64_t val = 0;
+  ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+              SyscallSucceedsWithValue(sizeof(uint64_t)));
   EXPECT_EQ(1, val);
 }
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index af94d7baa..a2f6ef8cc 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -34,7 +34,7 @@ namespace gvisor {
 namespace testing {
 
 // Gets a pointer to the port component of the given address.
-uint16* Port(struct sockaddr_storage* addr) {
+uint16_t* Port(struct sockaddr_storage* addr) {
   switch (addr->ss_family) {
     case AF_INET: {
       auto sin = reinterpret_cast<struct sockaddr_in*>(addr);
@@ -331,7 +331,7 @@ TEST_P(UdpSocketTest, Connect) {
   EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
 }
 
-void ConnectAny(AddressFamily family, int sockfd, uint16 port) {
+void ConnectAny(AddressFamily family, int sockfd, uint16_t port) {
   struct sockaddr_storage addr = {};
 
   // Precondition check.
@@ -1398,7 +1398,7 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
   received_iov.iov_len = kDataLength;
   received_msg.msg_iov = &received_iov;
   received_msg.msg_iovlen = 1;
-  size_t cmsg_data_len = sizeof(int8);
+  size_t cmsg_data_len = sizeof(int8_t);
   if (sent_type == IPV6_TCLASS) {
     cmsg_data_len = sizeof(int);
   }
@@ -1413,7 +1413,7 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
   EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
   EXPECT_EQ(cmsg->cmsg_level, sent_level);
   EXPECT_EQ(cmsg->cmsg_type, sent_type);
-  int8 received_tos = 0;
+  int8_t received_tos = 0;
   memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
   EXPECT_EQ(received_tos, sent_tos);
 }
@@ -1453,7 +1453,7 @@ TEST_P(UdpSocketTest, SendAndReceiveTOS) {
   sent_iov.iov_len = kDataLength;
   sent_msg.msg_iov = &sent_iov;
   sent_msg.msg_iovlen = 1;
-  size_t cmsg_data_len = sizeof(int8);
+  size_t cmsg_data_len = sizeof(int8_t);
   if (sent_level == SOL_IPV6) {
     sent_type = IPV6_TCLASS;
     cmsg_data_len = sizeof(int);
@@ -1467,7 +1467,7 @@ TEST_P(UdpSocketTest, SendAndReceiveTOS) {
   sent_cmsg->cmsg_len = CMSG_LEN(cmsg_data_len);
   sent_cmsg->cmsg_level = sent_level;
   sent_cmsg->cmsg_type = sent_type;
-  *(int8*)CMSG_DATA(sent_cmsg) = sent_tos;
+  *(int8_t*)CMSG_DATA(sent_cmsg) = sent_tos;
 
   ASSERT_THAT(RetryEINTR(sendmsg)(t_, &sent_msg, 0),
               SyscallSucceedsWithValue(kDataLength));
@@ -1491,7 +1491,7 @@ TEST_P(UdpSocketTest, SendAndReceiveTOS) {
   EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
   EXPECT_EQ(cmsg->cmsg_level, sent_level);
   EXPECT_EQ(cmsg->cmsg_type, sent_type);
-  int8 received_tos = 0;
+  int8_t received_tos = 0;
   memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
   EXPECT_EQ(received_tos, sent_tos);
 }
diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc
index e0e39e5e3..6218fbce1 100644
--- a/test/syscalls/linux/uidgid.cc
+++ b/test/syscalls/linux/uidgid.cc
@@ -27,10 +27,10 @@
 #include "test/util/thread_util.h"
 #include "test/util/uid_util.h"
 
-ABSL_FLAG(int32, scratch_uid1, 65534, "first scratch UID");
-ABSL_FLAG(int32, scratch_uid2, 65533, "second scratch UID");
-ABSL_FLAG(int32, scratch_gid1, 65534, "first scratch GID");
-ABSL_FLAG(int32, scratch_gid2, 65533, "second scratch GID");
+ABSL_FLAG(int32_t, scratch_uid1, 65534, "first scratch UID");
+ABSL_FLAG(int32_t, scratch_uid2, 65533, "second scratch UID");
+ABSL_FLAG(int32_t, scratch_gid1, 65534, "first scratch GID");
+ABSL_FLAG(int32_t, scratch_gid2, 65533, "second scratch GID");
 
 using ::testing::UnorderedElementsAreArray;
 
diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc
index e7bae9c07..3a927a430 100644
--- a/test/syscalls/linux/utimes.cc
+++ b/test/syscalls/linux/utimes.cc
@@ -163,12 +163,12 @@ TEST(FutimesatTest, OnRelPath) {
 TEST(FutimesatTest, InvalidNsec) {
   auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
   struct timeval times[4][2] = {{
-                                    {0, 1},                       // Valid
-                                    {1, static_cast<int64>(1e7)}  // Invalid
+                                    {0, 1},                         // Valid
+                                    {1, static_cast<int64_t>(1e7)}  // Invalid
                                 },
                                 {
-                                    {1, static_cast<int64>(1e7)},  // Invalid
-                                    {0, 1}                         // Valid
+                                    {1, static_cast<int64_t>(1e7)},  // Invalid
+                                    {0, 1}                           // Valid
                                 },
                                 {
                                     {0, 1},  // Valid
@@ -288,14 +288,15 @@ TEST(UtimeTest, ZeroAtimeandMtime) {
 
 TEST(UtimensatTest, InvalidNsec) {
   auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
-  struct timespec times[2][2] = {{
-                                     {0, UTIME_OMIT},               // Valid
-                                     {2, static_cast<int64>(1e10)}  // Invalid
-                                 },
-                                 {
-                                     {2, static_cast<int64>(1e10)},  // Invalid
-                                     {0, UTIME_OMIT}                 // Valid
-                                 }};
+  struct timespec times[2][2] = {
+      {
+          {0, UTIME_OMIT},                 // Valid
+          {2, static_cast<int64_t>(1e10)}  // Invalid
+      },
+      {
+          {2, static_cast<int64_t>(1e10)},  // Invalid
+          {0, UTIME_OMIT}                   // Valid
+      }};
 
   for (unsigned int i = 0; i < sizeof(times) / sizeof(times[0]); i++) {
     std::cout << "test:" << i << "\n";
diff --git a/test/syscalls/linux/vfork.cc b/test/syscalls/linux/vfork.cc
index 153b3bd69..0aaba482d 100644
--- a/test/syscalls/linux/vfork.cc
+++ b/test/syscalls/linux/vfork.cc
@@ -51,7 +51,7 @@ constexpr absl::Duration kChildDelay = absl::Seconds(10);
 // errno, so kChildExitCode is chosen to be an unlikely errno:
 constexpr int kChildExitCode = 118;  // ENOTNAM: Not a XENIX named type file
 
-int64 MonotonicNow() {
+int64_t MonotonicNow() {
   struct timespec now;
   TEST_PCHECK(clock_gettime(CLOCK_MONOTONIC, &now) == 0);
   return now.tv_sec * 1000000000ll + now.tv_nsec;
@@ -62,7 +62,7 @@ TEST(VforkTest, ParentStopsUntilChildExits) {
     // N.B. Run the test in a single-threaded subprocess because
     // vfork is not safe in a multi-threaded process.
 
-    const int64 start = MonotonicNow();
+    const int64_t start = MonotonicNow();
 
     pid_t pid = vfork();
     if (pid == 0) {
@@ -72,7 +72,7 @@ TEST(VforkTest, ParentStopsUntilChildExits) {
     TEST_PCHECK_MSG(pid > 0, "vfork failed");
     MaybeSave();
 
-    const int64 end = MonotonicNow();
+    const int64_t end = MonotonicNow();
 
     absl::Duration dur = absl::Nanoseconds(end - start);
 
@@ -92,7 +92,7 @@ TEST(VforkTest, ParentStopsUntilChildExecves_NoRandomSave) {
   char* const* const child_argv = owned_child_argv.get();
 
   const auto test = [&] {
-    const int64 start = MonotonicNow();
+    const int64_t start = MonotonicNow();
 
     pid_t pid = vfork();
     if (pid == 0) {
@@ -104,7 +104,7 @@ TEST(VforkTest, ParentStopsUntilChildExecves_NoRandomSave) {
     // since the test expects an upper bound on the time spent
     // stopped.
     int saved_errno = errno;
-    const int64 end = MonotonicNow();
+    const int64_t end = MonotonicNow();
     errno = saved_errno;
     TEST_PCHECK_MSG(pid > 0, "vfork failed");
     MaybeSave();
@@ -143,7 +143,7 @@ TEST(VforkTest, ExecedChildExitDoesntUnstopParent_NoRandomSave) {
     // pid1 exec'd and is now sleeping.
     SleepSafe(kChildDelay / 2);
 
-    const int64 start = MonotonicNow();
+    const int64_t start = MonotonicNow();
 
     pid_t pid2 = vfork();
     if (pid2 == 0) {
@@ -153,7 +153,7 @@ TEST(VforkTest, ExecedChildExitDoesntUnstopParent_NoRandomSave) {
     TEST_PCHECK_MSG(pid2 > 0, "vfork failed");
     MaybeSave();
 
-    const int64 end = MonotonicNow();
+    const int64_t end = MonotonicNow();
 
     absl::Duration dur = absl::Nanoseconds(end - start);
 
diff --git a/test/syscalls/linux/vsyscall.cc b/test/syscalls/linux/vsyscall.cc
index 99e8c6cea..2c2303358 100644
--- a/test/syscalls/linux/vsyscall.cc
+++ b/test/syscalls/linux/vsyscall.cc
@@ -25,7 +25,7 @@ namespace testing {
 namespace {
 
 time_t vsyscall_time(time_t* t) {
-  constexpr uint64 kVsyscallTimeEntry = 0xffffffffff600400;
+  constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
   return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
 }
 
diff --git a/test/syscalls/linux/wait.cc b/test/syscalls/linux/wait.cc
index 709b87a21..944149d5e 100644
--- a/test/syscalls/linux/wait.cc
+++ b/test/syscalls/linux/wait.cc
@@ -64,7 +64,7 @@ static const size_t kStackSize = 2 * kPageSize;
 // The child thread created in CloneAndExit runs this function.
 // This child does not have the TLS setup, so it must not use glibc functions.
 int CloneChild(void* priv) {
-  int64 sleep = reinterpret_cast<int64>(priv);
+  int64_t sleep = reinterpret_cast<int64_t>(priv);
   SleepSafe(absl::Seconds(sleep));
 
   // glibc's _exit(2) function wrapper will helpfully call exit_group(2),
@@ -75,7 +75,7 @@ int CloneChild(void* priv) {
 
 // ForkAndExit forks a child process which exits with exit_code, after
 // sleeping for the specified duration (seconds).
-pid_t ForkAndExit(int exit_code, int64 sleep) {
+pid_t ForkAndExit(int exit_code, int64_t sleep) {
   pid_t child = fork();
   if (child == 0) {
     SleepSafe(absl::Seconds(sleep));
@@ -84,16 +84,16 @@ pid_t ForkAndExit(int exit_code, int64 sleep) {
   return child;
 }
 
-int64 clock_gettime_nsecs(clockid_t id) {
+int64_t clock_gettime_nsecs(clockid_t id) {
   struct timespec ts;
   TEST_PCHECK(clock_gettime(id, &ts) == 0);
   return (ts.tv_sec * 1000000000 + ts.tv_nsec);
 }
 
-void spin(int64 sec) {
-  int64 ns = sec * 1000000000;
-  int64 start = clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID);
-  int64 end = start + ns;
+void spin(int64_t sec) {
+  int64_t ns = sec * 1000000000;
+  int64_t start = clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID);
+  int64_t end = start + ns;
 
   do {
     constexpr int kLoopCount = 1000000;  // large and arbitrary
@@ -105,7 +105,7 @@ void spin(int64 sec) {
 
 // ForkSpinAndExit forks a child process which exits with exit_code, after
 // spinning for the specified duration (seconds).
-pid_t ForkSpinAndExit(int exit_code, int64 spintime) {
+pid_t ForkSpinAndExit(int exit_code, int64_t spintime) {
   pid_t child = fork();
   if (child == 0) {
     spin(spintime);
@@ -141,7 +141,7 @@ int FreeStack(uintptr_t addr) {
 // CloneAndExit clones a child thread, which exits with 0 after sleeping for
 // the specified duration (must be in seconds). extra_flags are ORed against
 // the standard clone(2) flags.
-int CloneAndExit(int64 sleep, uintptr_t stack, int extra_flags) {
+int CloneAndExit(int64_t sleep, uintptr_t stack, int extra_flags) {
   return clone(CloneChild, reinterpret_cast<void*>(stack),
                CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_VM | extra_flags,
                reinterpret_cast<void*>(sleep));
diff --git a/test/util/mount_util.h b/test/util/mount_util.h
index 51119f22f..23eea51a2 100644
--- a/test/util/mount_util.h
+++ b/test/util/mount_util.h
@@ -33,9 +33,9 @@ namespace testing {
 // destroyed.
 inline PosixErrorOr<Cleanup> Mount(const std::string &source,
                                    const std::string &target,
-                                   const std::string &fstype, uint64 mountflags,
-                                   const std::string &data,
-                                   uint64 umountflags) {
+                                   const std::string &fstype,
+                                   uint64_t mountflags, const std::string &data,
+                                   uint64_t umountflags) {
   if (mount(source.c_str(), target.c_str(), fstype.c_str(), mountflags,
             data.c_str()) == -1) {
     return PosixError(errno, "mount failed");
diff --git a/test/util/multiprocess_util.cc b/test/util/multiprocess_util.cc
index ba601f300..8b676751b 100644
--- a/test/util/multiprocess_util.cc
+++ b/test/util/multiprocess_util.cc
@@ -135,7 +135,7 @@ PosixErrorOr<Cleanup> ForkAndExec(const std::string& filename,
   return ForkAndExecHelper(exec_fn, fn, child, execve_errno);
 }
 
-PosixErrorOr<Cleanup> ForkAndExecveat(const int32 dirfd,
+PosixErrorOr<Cleanup> ForkAndExecveat(const int32_t dirfd,
                                       const std::string& pathname,
                                       const ExecveArray& argv,
                                       const ExecveArray& envv, const int flags,
diff --git a/test/util/multiprocess_util.h b/test/util/multiprocess_util.h
index 342e73a52..3e736261b 100644
--- a/test/util/multiprocess_util.h
+++ b/test/util/multiprocess_util.h
@@ -103,13 +103,14 @@ inline PosixErrorOr<Cleanup> ForkAndExec(const std::string& filename,
 }
 
 // Equivalent to ForkAndExec, except using dirfd and flags with execveat.
-PosixErrorOr<Cleanup> ForkAndExecveat(int32 dirfd, const std::string& pathname,
+PosixErrorOr<Cleanup> ForkAndExecveat(int32_t dirfd,
+                                      const std::string& pathname,
                                       const ExecveArray& argv,
                                       const ExecveArray& envv, int flags,
                                       const std::function<void()>& fn,
                                       pid_t* child, int* execve_errno);
 
-inline PosixErrorOr<Cleanup> ForkAndExecveat(int32 dirfd,
+inline PosixErrorOr<Cleanup> ForkAndExecveat(int32_t dirfd,
                                              const std::string& pathname,
                                              const ExecveArray& argv,
                                              const ExecveArray& envv, int flags,
diff --git a/test/util/proc_util.cc b/test/util/proc_util.cc
index c81f363ef..34d636ba9 100644
--- a/test/util/proc_util.cc
+++ b/test/util/proc_util.cc
@@ -72,7 +72,7 @@ PosixErrorOr<ProcMapsEntry> ParseProcMapsLine(absl::string_view line) {
   ASSIGN_OR_RETURN_ERRNO(map_entry.major, AtoiBase(device[0], 16));
   ASSIGN_OR_RETURN_ERRNO(map_entry.minor, AtoiBase(device[1], 16));
 
-  ASSIGN_OR_RETURN_ERRNO(map_entry.inode, Atoi<int64>(parts[4]));
+  ASSIGN_OR_RETURN_ERRNO(map_entry.inode, Atoi<int64_t>(parts[4]));
   if (parts.size() == 6) {
     // A filename is present. However, absl::StrSplit retained the whitespace
     // between the inode number and the filename.
diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc
index f5096dd53..35aacb172 100644
--- a/test/util/temp_path.cc
+++ b/test/util/temp_path.cc
@@ -32,7 +32,7 @@ namespace testing {
 
 namespace {
 
-std::atomic<uint64> global_temp_file_number = ATOMIC_VAR_INIT(1);
+std::atomic<uint64_t> global_temp_file_number = ATOMIC_VAR_INIT(1);
 
 // Return a new temp filename, intended to be unique system-wide.
 //
diff --git a/test/util/test_util.cc b/test/util/test_util.cc
index 51f4b4539..848504c88 100644
--- a/test/util/test_util.cc
+++ b/test/util/test_util.cc
@@ -79,7 +79,7 @@ bool IsRunningWithHostinet() {
 #endif  // defined(__x86_64__)
 
 CPUVendor GetCPUVendor() {
-  uint32 eax, ebx, ecx, edx;
+  uint32_t eax, ebx, ecx, edx;
   std::string vendor_str;
   // Get vendor string (issue CPUID with eax = 0)
   GETCPUID(eax, ebx, ecx, edx, 0, 0);
@@ -179,36 +179,36 @@ PosixErrorOr<std::vector<OpenFd>> GetOpenFDs() {
   return ret_fds;
 }
 
-PosixErrorOr<uint64> Links(const std::string& path) {
+PosixErrorOr<uint64_t> Links(const std::string& path) {
   struct stat st;
   if (stat(path.c_str(), &st)) {
     return PosixError(errno, absl::StrCat("Failed to stat ", path));
   }
-  return static_cast<uint64>(st.st_nlink);
+  return static_cast<uint64_t>(st.st_nlink);
 }
 
 void RandomizeBuffer(void* buffer, size_t len) {
   struct timespec ts = {};
   clock_gettime(CLOCK_MONOTONIC, &ts);
-  uint32 seed = static_cast<uint32>(ts.tv_nsec);
+  uint32_t seed = static_cast<uint32_t>(ts.tv_nsec);
   char* const buf = static_cast<char*>(buffer);
   for (size_t i = 0; i < len; i++) {
     buf[i] = rand_r(&seed) % 255;
   }
 }
 
-std::vector<std::vector<struct iovec>> GenerateIovecs(uint64 total_size,
+std::vector<std::vector<struct iovec>> GenerateIovecs(uint64_t total_size,
                                                       void* buf,
                                                       size_t buflen) {
   std::vector<std::vector<struct iovec>> result;
-  for (uint64 offset = 0; offset < total_size;) {
+  for (uint64_t offset = 0; offset < total_size;) {
     auto& iovec_array = *result.emplace(result.end());
 
     for (; offset < total_size && iovec_array.size() < IOV_MAX;
          offset += buflen) {
       struct iovec iov = {};
       iov.iov_base = buf;
-      iov.iov_len = std::min<uint64>(total_size - offset, buflen);
+      iov.iov_len = std::min<uint64_t>(total_size - offset, buflen);
       iovec_array.push_back(iov);
     }
   }
@@ -216,15 +216,15 @@ std::vector<std::vector<struct iovec>> GenerateIovecs(uint64 total_size,
   return result;
 }
 
-uint64 Megabytes(uint64 n) {
+uint64_t Megabytes(uint64_t n) {
   // Overflow check, upper 20 bits in n shouldn't be set.
   TEST_CHECK(!(0xfffff00000000000 & n));
   return n << 20;
 }
 
-bool Equivalent(uint64 current, uint64 target, double tolerance) {
+bool Equivalent(uint64_t current, uint64_t target, double tolerance) {
   auto abs_diff = target > current ? target - current : current - target;
-  return abs_diff <= static_cast<uint64>(tolerance * target);
+  return abs_diff <= static_cast<uint64_t>(tolerance * target);
 }
 
 }  // namespace testing
diff --git a/test/util/test_util.h b/test/util/test_util.h
index 6eb46ac76..b3235c7e3 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -264,7 +264,7 @@ std::ostream& operator<<(std::ostream& out, OpenFd const& ofd);
 PosixErrorOr<std::vector<OpenFd>> GetOpenFDs();
 
 // Returns the number of hard links to a path.
-PosixErrorOr<uint64> Links(const std::string& path);
+PosixErrorOr<uint64_t> Links(const std::string& path);
 
 namespace internal {
 
@@ -706,7 +706,7 @@ inline PosixErrorOr<T> Atoi(absl::string_view str) {
   return ret;
 }
 
-inline PosixErrorOr<uint64> AtoiBase(absl::string_view str, int base) {
+inline PosixErrorOr<uint64_t> AtoiBase(absl::string_view str, int base) {
   if (base > 255 || base < 2) {
     return PosixError(EINVAL, "Invalid Base");
   }
@@ -737,16 +737,16 @@ inline PosixErrorOr<float> Atof(absl::string_view str) {
 
 // Return the smallest number of iovec arrays that can be used to write
 // "total_bytes" number of bytes, each iovec writing one "buf".
-std::vector<std::vector<struct iovec>> GenerateIovecs(uint64 total_size,
+std::vector<std::vector<struct iovec>> GenerateIovecs(uint64_t total_size,
                                                       void* buf, size_t buflen);
 
 // Returns bytes in 'n' megabytes. Used for readability.
-uint64 Megabytes(uint64 n);
+uint64_t Megabytes(uint64_t n);
 
 // Predicate for checking that a value is within some tolerance of another
 // value. Returns true iff current is in the range [target * (1 - tolerance),
 // target * (1 + tolerance)].
-bool Equivalent(uint64 current, uint64 target, double tolerance);
+bool Equivalent(uint64_t current, uint64_t target, double tolerance);
 
 // Matcher wrapping the Equivalent predicate.
 MATCHER_P2(EquivalentWithin, target, tolerance,
@@ -756,7 +756,7 @@ MATCHER_P2(EquivalentWithin, target, tolerance,
   if (target == 0) {
     *result_listener << ::absl::StreamFormat("difference of infinity%%");
   } else {
-    int64 delta = static_cast<int64>(arg) - static_cast<int64>(target);
+    int64_t delta = static_cast<int64_t>(arg) - static_cast<int64_t>(target);
     double delta_percent =
         static_cast<double>(delta) / static_cast<double>(target) * 100;
     *result_listener << ::absl::StreamFormat("difference of %.2f%%",
diff --git a/test/util/test_util_test.cc b/test/util/test_util_test.cc
index 024304535..f42100374 100644
--- a/test/util/test_util_test.cc
+++ b/test/util/test_util_test.cc
@@ -171,7 +171,7 @@ MATCHER_P(IovecsListEq, expected, "") {
     return false;
   }
 
-  for (uint64 i = 0; i < expected.size(); ++i) {
+  for (uint64_t i = 0; i < expected.size(); ++i) {
     const std::vector<struct iovec>& actual_iovecs = arg[i];
     const std::vector<struct iovec>& expected_iovecs = expected[i];
     if (actual_iovecs.size() != expected_iovecs.size()) {
@@ -181,7 +181,7 @@ MATCHER_P(IovecsListEq, expected, "") {
       return false;
     }
 
-    for (uint64 j = 0; j < expected_iovecs.size(); ++j) {
+    for (uint64_t j = 0; j < expected_iovecs.size(); ++j) {
       const struct iovec& actual_iov = actual_iovecs[j];
       const struct iovec& expected_iov = expected_iovecs[j];
       if (actual_iov.iov_base != expected_iov.iov_base) {
-- 
cgit v1.2.3


From d59a3cc959cb14b0bed14b62e33ee4178b89b346 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Wed, 22 Jan 2020 05:51:57 +0000
Subject: Enable fault() syscall test on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I9b2b2e0d84946c10cf136abeef6c60642fa3b6ec
---
 test/syscalls/linux/fault.cc | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/fault.cc b/test/syscalls/linux/fault.cc
index f6e19026f..a85750382 100644
--- a/test/syscalls/linux/fault.cc
+++ b/test/syscalls/linux/fault.cc
@@ -37,6 +37,9 @@ int GetPcFromUcontext(ucontext_t* uc, uintptr_t* pc) {
 #elif defined(__i386__)
   *pc = uc->uc_mcontext.gregs[REG_EIP];
   return 1;
+#elif defined(__aarch64__)
+  *pc = uc->uc_mcontext.pc;
+  return 1;
 #else
   return 0;
 #endif
-- 
cgit v1.2.3


From 49e84b10e5ed7f94e6cbe003b9f7268e8235bb08 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Wed, 22 Jan 2020 06:22:18 +0000
Subject: Unify the kOLargeFile definition in syscall tests.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: Id9d6ae98305a4057d55d622ea4c3ac2228fea212
---
 test/syscalls/linux/fcntl.cc |  5 +----
 test/syscalls/linux/pipe.cc  |  6 +++---
 test/syscalls/linux/proc.cc  | 12 ------------
 test/util/fs_util.h          | 11 +++++++++++
 4 files changed, 15 insertions(+), 19 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 4f3aa81d6..421c15b87 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -31,6 +31,7 @@
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/cleanup.h"
 #include "test/util/eventfd_util.h"
+#include "test/util/fs_util.h"
 #include "test/util/multiprocess_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/save_util.h"
@@ -55,10 +56,6 @@ ABSL_FLAG(int32_t, socket_fd, -1,
 namespace gvisor {
 namespace testing {
 
-// O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
-// because "it isn't needed", even though Linux can return it via F_GETFL.
-constexpr int kOLargeFile = 00100000;
-
 class FcntlLockTest : public ::testing::Test {
  public:
   void SetUp() override {
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index ac9b21b24..d8e19e910 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -25,6 +25,7 @@
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
@@ -144,11 +145,10 @@ TEST_P(PipeTest, Flags) {
 
   if (IsNamedPipe()) {
     // May be stubbed to zero; define locally.
-    constexpr int kLargefile = 0100000;
     EXPECT_THAT(fcntl(rfd_.get(), F_GETFL),
-                SyscallSucceedsWithValue(kLargefile | O_RDONLY));
+                SyscallSucceedsWithValue(kOLargeFile | O_RDONLY));
     EXPECT_THAT(fcntl(wfd_.get(), F_GETFL),
-                SyscallSucceedsWithValue(kLargefile | O_WRONLY));
+                SyscallSucceedsWithValue(kOLargeFile | O_WRONLY));
   } else {
     EXPECT_THAT(fcntl(rfd_.get(), F_GETFL), SyscallSucceedsWithValue(O_RDONLY));
     EXPECT_THAT(fcntl(wfd_.get(), F_GETFL), SyscallSucceedsWithValue(O_WRONLY));
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index bf9bb45d3..a03c1e43d 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -100,18 +100,6 @@ namespace {
 #define SUID_DUMP_ROOT 2
 #endif /* SUID_DUMP_ROOT */
 
-// O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
-// because "it isn't needed", even though Linux can return it via F_GETFL.
-#if defined(__x86_64__) || defined(__i386__)
-constexpr int kOLargeFile = 00100000;
-#elif __aarch64__
-// The value originate from the Linux
-// kernel's arch/arm64/include/uapi/asm/fcntl.h.
-constexpr int kOLargeFile = 00400000;
-#else
-#error "Unknown architecture"
-#endif
-
 #if defined(__x86_64__) || defined(__i386__)
 // This list of "required" fields is taken from reading the file
 // arch/x86/kernel/cpu/proc.c and seeing which fields will be unconditionally
diff --git a/test/util/fs_util.h b/test/util/fs_util.h
index ee1b341d7..caf19b24d 100644
--- a/test/util/fs_util.h
+++ b/test/util/fs_util.h
@@ -26,6 +26,17 @@
 
 namespace gvisor {
 namespace testing {
+
+// O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
+// because "it isn't needed", even though Linux can return it via F_GETFL.
+#if defined(__x86_64__)
+constexpr int kOLargeFile = 00100000;
+#elif defined(__aarch64__)
+constexpr int kOLargeFile = 00400000;
+#else
+#error "Unknown architecture"
+#endif
+
 // Returns a status or the current working directory.
 PosixErrorOr<std::string> GetCWD();
 
-- 
cgit v1.2.3


From d29e59af9fbd420e34378bcbf7ae543134070217 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 27 Jan 2020 10:04:07 -0800
Subject: Standardize on tools directory.

PiperOrigin-RevId: 291745021
---
 .bazelrc                                           |   8 +-
 BUILD                                              |  49 ++++++-
 benchmarks/defs.bzl                                |  18 ---
 benchmarks/harness/BUILD                           |  74 +++++-----
 benchmarks/harness/machine_producers/BUILD         |   4 +-
 benchmarks/runner/BUILD                            |  24 ++--
 benchmarks/tcp/BUILD                               |   3 +-
 benchmarks/workloads/ab/BUILD                      |  19 ++-
 benchmarks/workloads/absl/BUILD                    |  19 ++-
 benchmarks/workloads/curl/BUILD                    |   2 +-
 benchmarks/workloads/ffmpeg/BUILD                  |   2 +-
 benchmarks/workloads/fio/BUILD                     |  19 ++-
 benchmarks/workloads/httpd/BUILD                   |   2 +-
 benchmarks/workloads/iperf/BUILD                   |  19 ++-
 benchmarks/workloads/netcat/BUILD                  |   2 +-
 benchmarks/workloads/nginx/BUILD                   |   2 +-
 benchmarks/workloads/node/BUILD                    |   2 +-
 benchmarks/workloads/node_template/BUILD           |   2 +-
 benchmarks/workloads/redis/BUILD                   |   2 +-
 benchmarks/workloads/redisbenchmark/BUILD          |  19 ++-
 benchmarks/workloads/ruby/BUILD                    |   2 +-
 benchmarks/workloads/ruby_template/BUILD           |   2 +-
 benchmarks/workloads/sleep/BUILD                   |   2 +-
 benchmarks/workloads/sysbench/BUILD                |  19 ++-
 benchmarks/workloads/syscall/BUILD                 |  19 ++-
 benchmarks/workloads/tensorflow/BUILD              |   2 +-
 benchmarks/workloads/true/BUILD                    |   2 +-
 pkg/abi/BUILD                                      |   3 +-
 pkg/abi/linux/BUILD                                |   6 +-
 pkg/amutex/BUILD                                   |   6 +-
 pkg/atomicbitops/BUILD                             |   6 +-
 pkg/binary/BUILD                                   |   6 +-
 pkg/bits/BUILD                                     |   6 +-
 pkg/bpf/BUILD                                      |   6 +-
 pkg/compressio/BUILD                               |   6 +-
 pkg/control/client/BUILD                           |   3 +-
 pkg/control/server/BUILD                           |   3 +-
 pkg/cpuid/BUILD                                    |   8 +-
 pkg/eventchannel/BUILD                             |  16 +--
 pkg/fd/BUILD                                       |   6 +-
 pkg/fdchannel/BUILD                                |   8 +-
 pkg/fdnotifier/BUILD                               |   3 +-
 pkg/flipcall/BUILD                                 |   8 +-
 pkg/fspath/BUILD                                   |  13 +-
 pkg/gate/BUILD                                     |   4 +-
 pkg/goid/BUILD                                     |   6 +-
 pkg/ilist/BUILD                                    |   6 +-
 pkg/linewriter/BUILD                               |   6 +-
 pkg/log/BUILD                                      |   6 +-
 pkg/memutil/BUILD                                  |   3 +-
 pkg/metric/BUILD                                   |  23 +--
 pkg/p9/BUILD                                       |   6 +-
 pkg/p9/p9test/BUILD                                |   6 +-
 pkg/procid/BUILD                                   |   8 +-
 pkg/rand/BUILD                                     |   3 +-
 pkg/refs/BUILD                                     |   6 +-
 pkg/seccomp/BUILD                                  |   6 +-
 pkg/secio/BUILD                                    |   6 +-
 pkg/segment/test/BUILD                             |   6 +-
 pkg/sentry/BUILD                                   |   2 +
 pkg/sentry/arch/BUILD                              |  20 +--
 pkg/sentry/context/BUILD                           |   3 +-
 pkg/sentry/context/contexttest/BUILD               |   3 +-
 pkg/sentry/control/BUILD                           |   8 +-
 pkg/sentry/device/BUILD                            |   6 +-
 pkg/sentry/fs/BUILD                                |   6 +-
 pkg/sentry/fs/anon/BUILD                           |   3 +-
 pkg/sentry/fs/dev/BUILD                            |   3 +-
 pkg/sentry/fs/fdpipe/BUILD                         |   6 +-
 pkg/sentry/fs/filetest/BUILD                       |   3 +-
 pkg/sentry/fs/fsutil/BUILD                         |   6 +-
 pkg/sentry/fs/gofer/BUILD                          |   6 +-
 pkg/sentry/fs/host/BUILD                           |   6 +-
 pkg/sentry/fs/lock/BUILD                           |   6 +-
 pkg/sentry/fs/proc/BUILD                           |   6 +-
 pkg/sentry/fs/proc/device/BUILD                    |   3 +-
 pkg/sentry/fs/proc/seqfile/BUILD                   |   6 +-
 pkg/sentry/fs/ramfs/BUILD                          |   6 +-
 pkg/sentry/fs/sys/BUILD                            |   3 +-
 pkg/sentry/fs/timerfd/BUILD                        |   3 +-
 pkg/sentry/fs/tmpfs/BUILD                          |   6 +-
 pkg/sentry/fs/tty/BUILD                            |   6 +-
 pkg/sentry/fsimpl/ext/BUILD                        |   6 +-
 pkg/sentry/fsimpl/ext/benchmark/BUILD              |   2 +-
 pkg/sentry/fsimpl/ext/disklayout/BUILD             |   6 +-
 pkg/sentry/fsimpl/kernfs/BUILD                     |   6 +-
 pkg/sentry/fsimpl/proc/BUILD                       |   8 +-
 pkg/sentry/fsimpl/sys/BUILD                        |   6 +-
 pkg/sentry/fsimpl/testutil/BUILD                   |   5 +-
 pkg/sentry/fsimpl/tmpfs/BUILD                      |   8 +-
 pkg/sentry/hostcpu/BUILD                           |   6 +-
 pkg/sentry/hostmm/BUILD                            |   3 +-
 pkg/sentry/inet/BUILD                              |   3 +-
 pkg/sentry/kernel/BUILD                            |  24 +---
 pkg/sentry/kernel/auth/BUILD                       |   3 +-
 pkg/sentry/kernel/contexttest/BUILD                |   3 +-
 pkg/sentry/kernel/epoll/BUILD                      |   6 +-
 pkg/sentry/kernel/eventfd/BUILD                    |   6 +-
 pkg/sentry/kernel/fasync/BUILD                     |   3 +-
 pkg/sentry/kernel/futex/BUILD                      |   6 +-
 pkg/sentry/kernel/memevent/BUILD                   |  20 +--
 pkg/sentry/kernel/pipe/BUILD                       |   6 +-
 pkg/sentry/kernel/sched/BUILD                      |   6 +-
 pkg/sentry/kernel/semaphore/BUILD                  |   6 +-
 pkg/sentry/kernel/shm/BUILD                        |   3 +-
 pkg/sentry/kernel/signalfd/BUILD                   |   5 +-
 pkg/sentry/kernel/time/BUILD                       |   3 +-
 pkg/sentry/limits/BUILD                            |   6 +-
 pkg/sentry/loader/BUILD                            |   4 +-
 pkg/sentry/memmap/BUILD                            |   6 +-
 pkg/sentry/mm/BUILD                                |   6 +-
 pkg/sentry/pgalloc/BUILD                           |   6 +-
 pkg/sentry/platform/BUILD                          |   3 +-
 pkg/sentry/platform/interrupt/BUILD                |   6 +-
 pkg/sentry/platform/kvm/BUILD                      |   6 +-
 pkg/sentry/platform/kvm/testutil/BUILD             |   3 +-
 pkg/sentry/platform/ptrace/BUILD                   |   3 +-
 pkg/sentry/platform/ring0/BUILD                    |   3 +-
 pkg/sentry/platform/ring0/gen_offsets/BUILD        |   2 +-
 pkg/sentry/platform/ring0/pagetables/BUILD         |  16 +--
 pkg/sentry/platform/safecopy/BUILD                 |   6 +-
 pkg/sentry/safemem/BUILD                           |   6 +-
 pkg/sentry/sighandling/BUILD                       |   3 +-
 pkg/sentry/socket/BUILD                            |   3 +-
 pkg/sentry/socket/control/BUILD                    |   3 +-
 pkg/sentry/socket/hostinet/BUILD                   |   3 +-
 pkg/sentry/socket/netfilter/BUILD                  |   3 +-
 pkg/sentry/socket/netlink/BUILD                    |   3 +-
 pkg/sentry/socket/netlink/port/BUILD               |   6 +-
 pkg/sentry/socket/netlink/route/BUILD              |   3 +-
 pkg/sentry/socket/netlink/uevent/BUILD             |   3 +-
 pkg/sentry/socket/netstack/BUILD                   |   3 +-
 pkg/sentry/socket/unix/BUILD                       |   3 +-
 pkg/sentry/socket/unix/transport/BUILD             |   3 +-
 pkg/sentry/state/BUILD                             |   3 +-
 pkg/sentry/strace/BUILD                            |  20 +--
 pkg/sentry/syscalls/BUILD                          |   3 +-
 pkg/sentry/syscalls/linux/BUILD                    |   3 +-
 pkg/sentry/time/BUILD                              |   6 +-
 pkg/sentry/unimpl/BUILD                            |  21 +--
 pkg/sentry/uniqueid/BUILD                          |   3 +-
 pkg/sentry/usage/BUILD                             |   5 +-
 pkg/sentry/usermem/BUILD                           |   7 +-
 pkg/sentry/vfs/BUILD                               |   8 +-
 pkg/sentry/watchdog/BUILD                          |   3 +-
 pkg/sleep/BUILD                                    |   6 +-
 pkg/state/BUILD                                    |  17 +--
 pkg/state/statefile/BUILD                          |   6 +-
 pkg/sync/BUILD                                     |   6 +-
 pkg/sync/atomicptrtest/BUILD                       |   6 +-
 pkg/sync/seqatomictest/BUILD                       |   6 +-
 pkg/syserr/BUILD                                   |   3 +-
 pkg/syserror/BUILD                                 |   4 +-
 pkg/tcpip/BUILD                                    |   6 +-
 pkg/tcpip/adapters/gonet/BUILD                     |   6 +-
 pkg/tcpip/buffer/BUILD                             |   6 +-
 pkg/tcpip/checker/BUILD                            |   3 +-
 pkg/tcpip/hash/jenkins/BUILD                       |   6 +-
 pkg/tcpip/header/BUILD                             |   6 +-
 pkg/tcpip/iptables/BUILD                           |   3 +-
 pkg/tcpip/link/channel/BUILD                       |   3 +-
 pkg/tcpip/link/fdbased/BUILD                       |   6 +-
 pkg/tcpip/link/loopback/BUILD                      |   3 +-
 pkg/tcpip/link/muxed/BUILD                         |   6 +-
 pkg/tcpip/link/rawfile/BUILD                       |   3 +-
 pkg/tcpip/link/sharedmem/BUILD                     |   6 +-
 pkg/tcpip/link/sharedmem/pipe/BUILD                |   6 +-
 pkg/tcpip/link/sharedmem/queue/BUILD               |   6 +-
 pkg/tcpip/link/sniffer/BUILD                       |   3 +-
 pkg/tcpip/link/tun/BUILD                           |   3 +-
 pkg/tcpip/link/waitable/BUILD                      |   6 +-
 pkg/tcpip/network/BUILD                            |   2 +-
 pkg/tcpip/network/arp/BUILD                        |   4 +-
 pkg/tcpip/network/fragmentation/BUILD              |   6 +-
 pkg/tcpip/network/hash/BUILD                       |   3 +-
 pkg/tcpip/network/ipv4/BUILD                       |   4 +-
 pkg/tcpip/network/ipv6/BUILD                       |   6 +-
 pkg/tcpip/ports/BUILD                              |   6 +-
 pkg/tcpip/sample/tun_tcp_connect/BUILD             |   2 +-
 pkg/tcpip/sample/tun_tcp_echo/BUILD                |   2 +-
 pkg/tcpip/seqnum/BUILD                             |   3 +-
 pkg/tcpip/stack/BUILD                              |   6 +-
 pkg/tcpip/transport/icmp/BUILD                     |   3 +-
 pkg/tcpip/transport/packet/BUILD                   |   3 +-
 pkg/tcpip/transport/raw/BUILD                      |   3 +-
 pkg/tcpip/transport/tcp/BUILD                      |   4 +-
 pkg/tcpip/transport/tcp/testing/context/BUILD      |   3 +-
 pkg/tcpip/transport/tcpconntrack/BUILD             |   4 +-
 pkg/tcpip/transport/udp/BUILD                      |   4 +-
 pkg/tmutex/BUILD                                   |   6 +-
 pkg/unet/BUILD                                     |   6 +-
 pkg/urpc/BUILD                                     |   6 +-
 pkg/waiter/BUILD                                   |   6 +-
 runsc/BUILD                                        |  27 ++--
 runsc/boot/BUILD                                   |   5 +-
 runsc/boot/filter/BUILD                            |   3 +-
 runsc/boot/platforms/BUILD                         |   3 +-
 runsc/cgroup/BUILD                                 |   5 +-
 runsc/cmd/BUILD                                    |   5 +-
 runsc/console/BUILD                                |   3 +-
 runsc/container/BUILD                              |   5 +-
 runsc/container/test_app/BUILD                     |   4 +-
 runsc/criutil/BUILD                                |   3 +-
 runsc/dockerutil/BUILD                             |   3 +-
 runsc/fsgofer/BUILD                                |   9 +-
 runsc/fsgofer/filter/BUILD                         |   3 +-
 runsc/sandbox/BUILD                                |   3 +-
 runsc/specutils/BUILD                              |   5 +-
 runsc/testutil/BUILD                               |   3 +-
 runsc/version_test.sh                              |   2 +-
 scripts/common.sh                                  |   6 +-
 scripts/common_bazel.sh                            |  99 -------------
 scripts/common_build.sh                            |  99 +++++++++++++
 test/BUILD                                         |  45 +-----
 test/e2e/BUILD                                     |   5 +-
 test/image/BUILD                                   |   5 +-
 test/iptables/BUILD                                |   5 +-
 test/iptables/runner/BUILD                         |  12 +-
 test/root/BUILD                                    |   5 +-
 test/root/testdata/BUILD                           |   3 +-
 test/runtimes/BUILD                                |   4 +-
 test/runtimes/build_defs.bzl                       |   5 +-
 test/runtimes/images/proctor/BUILD                 |   4 +-
 test/syscalls/BUILD                                |   2 +-
 test/syscalls/build_defs.bzl                       |   6 +-
 test/syscalls/gtest/BUILD                          |   7 +-
 test/syscalls/linux/BUILD                          |  23 ++-
 test/syscalls/linux/arch_prctl.cc                  |   2 +
 test/syscalls/linux/rseq/BUILD                     |   5 +-
 .../linux/udp_socket_errqueue_test_case.cc         |   4 +
 test/uds/BUILD                                     |   3 +-
 test/util/BUILD                                    |  27 ++--
 test/util/save_util_linux.cc                       |   4 +
 test/util/save_util_other.cc                       |   4 +
 test/util/test_util_runfiles.cc                    |   4 +
 tools/BUILD                                        |   3 +
 tools/build/BUILD                                  |  10 ++
 tools/build/defs.bzl                               |  91 ++++++++++++
 tools/checkunsafe/BUILD                            |   3 +-
 tools/defs.bzl                                     | 154 +++++++++++++++++++++
 tools/go_generics/BUILD                            |   2 +-
 tools/go_generics/globals/BUILD                    |   4 +-
 tools/go_generics/go_merge/BUILD                   |   2 +-
 tools/go_generics/rules_tests/BUILD                |   2 +-
 tools/go_marshal/BUILD                             |   4 +-
 tools/go_marshal/README.md                         |  52 +------
 tools/go_marshal/analysis/BUILD                    |   5 +-
 tools/go_marshal/defs.bzl                          | 112 ++-------------
 tools/go_marshal/gomarshal/BUILD                   |   6 +-
 tools/go_marshal/gomarshal/generator.go            |  20 ++-
 tools/go_marshal/gomarshal/generator_tests.go      |   6 +-
 tools/go_marshal/main.go                           |  11 +-
 tools/go_marshal/marshal/BUILD                     |   5 +-
 tools/go_marshal/test/BUILD                        |   7 +-
 tools/go_marshal/test/external/BUILD               |   6 +-
 tools/go_stateify/BUILD                            |   2 +-
 tools/go_stateify/defs.bzl                         |  79 +----------
 tools/images/BUILD                                 |   2 +-
 tools/images/defs.bzl                              |   6 +-
 tools/issue_reviver/BUILD                          |   2 +-
 tools/issue_reviver/github/BUILD                   |   3 +-
 tools/issue_reviver/reviver/BUILD                  |   5 +-
 tools/workspace_status.sh                          |   2 +-
 vdso/BUILD                                         |  33 ++---
 264 files changed, 1012 insertions(+), 1380 deletions(-)
 delete mode 100644 benchmarks/defs.bzl
 delete mode 100755 scripts/common_bazel.sh
 create mode 100755 scripts/common_build.sh
 create mode 100644 tools/BUILD
 create mode 100644 tools/build/BUILD
 create mode 100644 tools/build/defs.bzl
 create mode 100644 tools/defs.bzl

(limited to 'test/syscalls/linux')

diff --git a/.bazelrc b/.bazelrc
index 9c35c5e7b..ef214bcfa 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -30,10 +30,10 @@ build:remote --auth_scope="https://www.googleapis.com/auth/cloud-source-tools"
 
 # Add a custom platform and toolchain that builds in a privileged docker
 # container, which is required by our syscall tests.
-build:remote --host_platform=//test:rbe_ubuntu1604
-build:remote --extra_toolchains=//test:cc-toolchain-clang-x86_64-default
-build:remote --extra_execution_platforms=//test:rbe_ubuntu1604
-build:remote --platforms=//test:rbe_ubuntu1604
+build:remote --host_platform=//:rbe_ubuntu1604
+build:remote --extra_toolchains=//:cc-toolchain-clang-x86_64-default
+build:remote --extra_execution_platforms=//:rbe_ubuntu1604
+build:remote --platforms=//:rbe_ubuntu1604
 build:remote --crosstool_top=@rbe_default//cc:toolchain
 build:remote --jobs=50
 build:remote --remote_timeout=3600
diff --git a/BUILD b/BUILD
index 76286174f..5fd929378 100644
--- a/BUILD
+++ b/BUILD
@@ -1,8 +1,8 @@
-package(licenses = ["notice"])  # Apache 2.0
-
 load("@io_bazel_rules_go//go:def.bzl", "go_path", "nogo")
 load("@bazel_gazelle//:def.bzl", "gazelle")
 
+package(licenses = ["notice"])
+
 # The sandbox filegroup is used for sandbox-internal dependencies.
 package_group(
     name = "sandbox",
@@ -49,9 +49,52 @@ gazelle(name = "gazelle")
 # live in the tools subdirectory (unless they are standard).
 nogo(
     name = "nogo",
-    config = "tools/nogo.js",
+    config = "//tools:nogo.js",
     visibility = ["//visibility:public"],
     deps = [
         "//tools/checkunsafe",
     ],
 )
+
+# We need to define a bazel platform and toolchain to specify dockerPrivileged
+# and dockerRunAsRoot options, they are required to run tests on the RBE
+# cluster in Kokoro.
+alias(
+    name = "rbe_ubuntu1604",
+    actual = ":rbe_ubuntu1604_r346485",
+)
+
+platform(
+    name = "rbe_ubuntu1604_r346485",
+    constraint_values = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//tools/cpp:clang",
+        "@bazel_toolchains//constraints:xenial",
+        "@bazel_toolchains//constraints/sanitizers:support_msan",
+    ],
+    remote_execution_properties = """
+        properties: {
+          name: "container-image"
+          value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50"
+        }
+        properties: {
+          name: "dockerAddCapabilities"
+          value: "SYS_ADMIN"
+        }
+        properties: {
+          name: "dockerPrivileged"
+          value: "true"
+        }
+    """,
+)
+
+toolchain(
+    name = "cc-toolchain-clang-x86_64-default",
+    exec_compatible_with = [
+    ],
+    target_compatible_with = [
+    ],
+    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
diff --git a/benchmarks/defs.bzl b/benchmarks/defs.bzl
deleted file mode 100644
index 79e6cdbc8..000000000
--- a/benchmarks/defs.bzl
+++ /dev/null
@@ -1,18 +0,0 @@
-"""Provides python helper functions."""
-
-load("@pydeps//:requirements.bzl", _requirement = "requirement")
-
-def filter_deps(deps = None):
-    if deps == None:
-        deps = []
-    return [dep for dep in deps if dep]
-
-def py_library(deps = None, **kwargs):
-    return native.py_library(deps = filter_deps(deps), **kwargs)
-
-def py_test(deps = None, **kwargs):
-    return native.py_test(deps = filter_deps(deps), **kwargs)
-
-def requirement(name, direct = True):
-    """ requirement returns the required dependency. """
-    return _requirement(name)
diff --git a/benchmarks/harness/BUILD b/benchmarks/harness/BUILD
index 081a74243..52d4e42f8 100644
--- a/benchmarks/harness/BUILD
+++ b/benchmarks/harness/BUILD
@@ -1,4 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "requirement")
+load("//tools:defs.bzl", "py_library", "py_requirement")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -25,16 +25,16 @@ py_library(
     srcs = ["container.py"],
     deps = [
         "//benchmarks/workloads",
-        requirement("asn1crypto", False),
-        requirement("chardet", False),
-        requirement("certifi", False),
-        requirement("docker", True),
-        requirement("docker-pycreds", False),
-        requirement("idna", False),
-        requirement("ptyprocess", False),
-        requirement("requests", False),
-        requirement("urllib3", False),
-        requirement("websocket-client", False),
+        py_requirement("asn1crypto", False),
+        py_requirement("chardet", False),
+        py_requirement("certifi", False),
+        py_requirement("docker", True),
+        py_requirement("docker-pycreds", False),
+        py_requirement("idna", False),
+        py_requirement("ptyprocess", False),
+        py_requirement("requests", False),
+        py_requirement("urllib3", False),
+        py_requirement("websocket-client", False),
     ],
 )
 
@@ -47,17 +47,17 @@ py_library(
         "//benchmarks/harness:ssh_connection",
         "//benchmarks/harness:tunnel_dispatcher",
         "//benchmarks/harness/machine_mocks",
-        requirement("asn1crypto", False),
-        requirement("chardet", False),
-        requirement("certifi", False),
-        requirement("docker", True),
-        requirement("docker-pycreds", False),
-        requirement("idna", False),
-        requirement("ptyprocess", False),
-        requirement("requests", False),
-        requirement("six", False),
-        requirement("urllib3", False),
-        requirement("websocket-client", False),
+        py_requirement("asn1crypto", False),
+        py_requirement("chardet", False),
+        py_requirement("certifi", False),
+        py_requirement("docker", True),
+        py_requirement("docker-pycreds", False),
+        py_requirement("idna", False),
+        py_requirement("ptyprocess", False),
+        py_requirement("requests", False),
+        py_requirement("six", False),
+        py_requirement("urllib3", False),
+        py_requirement("websocket-client", False),
     ],
 )
 
@@ -66,10 +66,10 @@ py_library(
     srcs = ["ssh_connection.py"],
     deps = [
         "//benchmarks/harness",
-        requirement("bcrypt", False),
-        requirement("cffi", True),
-        requirement("paramiko", True),
-        requirement("cryptography", False),
+        py_requirement("bcrypt", False),
+        py_requirement("cffi", True),
+        py_requirement("paramiko", True),
+        py_requirement("cryptography", False),
     ],
 )
 
@@ -77,16 +77,16 @@ py_library(
     name = "tunnel_dispatcher",
     srcs = ["tunnel_dispatcher.py"],
     deps = [
-        requirement("asn1crypto", False),
-        requirement("chardet", False),
-        requirement("certifi", False),
-        requirement("docker", True),
-        requirement("docker-pycreds", False),
-        requirement("idna", False),
-        requirement("pexpect", True),
-        requirement("ptyprocess", False),
-        requirement("requests", False),
-        requirement("urllib3", False),
-        requirement("websocket-client", False),
+        py_requirement("asn1crypto", False),
+        py_requirement("chardet", False),
+        py_requirement("certifi", False),
+        py_requirement("docker", True),
+        py_requirement("docker-pycreds", False),
+        py_requirement("idna", False),
+        py_requirement("pexpect", True),
+        py_requirement("ptyprocess", False),
+        py_requirement("requests", False),
+        py_requirement("urllib3", False),
+        py_requirement("websocket-client", False),
     ],
 )
diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD
index c4e943882..48ea0ef39 100644
--- a/benchmarks/harness/machine_producers/BUILD
+++ b/benchmarks/harness/machine_producers/BUILD
@@ -1,4 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "requirement")
+load("//tools:defs.bzl", "py_library", "py_requirement")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -31,7 +31,7 @@ py_library(
     deps = [
         "//benchmarks/harness:machine",
         "//benchmarks/harness/machine_producers:machine_producer",
-        requirement("PyYAML", False),
+        py_requirement("PyYAML", False),
     ],
 )
 
diff --git a/benchmarks/runner/BUILD b/benchmarks/runner/BUILD
index e1b2ea550..fae0ca800 100644
--- a/benchmarks/runner/BUILD
+++ b/benchmarks/runner/BUILD
@@ -1,4 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("//tools:defs.bzl", "py_library", "py_requirement", "py_test")
 
 package(licenses = ["notice"])
 
@@ -28,7 +28,7 @@ py_library(
         "//benchmarks/suites:startup",
         "//benchmarks/suites:sysbench",
         "//benchmarks/suites:syscall",
-        requirement("click", True),
+        py_requirement("click", True),
     ],
 )
 
@@ -36,7 +36,7 @@ py_library(
     name = "commands",
     srcs = ["commands.py"],
     deps = [
-        requirement("click", True),
+        py_requirement("click", True),
     ],
 )
 
@@ -50,14 +50,14 @@ py_test(
     ],
     deps = [
         ":runner",
-        requirement("click", True),
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("click", True),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
diff --git a/benchmarks/tcp/BUILD b/benchmarks/tcp/BUILD
index 735d7127f..d5e401acc 100644
--- a/benchmarks/tcp/BUILD
+++ b/benchmarks/tcp/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
-load("@rules_cc//cc:defs.bzl", "cc_binary")
+load("//tools:defs.bzl", "cc_binary", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/benchmarks/workloads/ab/BUILD b/benchmarks/workloads/ab/BUILD
index 4fc0ab735..4dd91ceb3 100644
--- a/benchmarks/workloads/ab/BUILD
+++ b/benchmarks/workloads/ab/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":ab",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/absl/BUILD b/benchmarks/workloads/absl/BUILD
index 61e010096..55dae3baa 100644
--- a/benchmarks/workloads/absl/BUILD
+++ b/benchmarks/workloads/absl/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":absl",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/curl/BUILD b/benchmarks/workloads/curl/BUILD
index eb0fb6165..a70873065 100644
--- a/benchmarks/workloads/curl/BUILD
+++ b/benchmarks/workloads/curl/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/ffmpeg/BUILD b/benchmarks/workloads/ffmpeg/BUILD
index be472dfb2..7c41ba631 100644
--- a/benchmarks/workloads/ffmpeg/BUILD
+++ b/benchmarks/workloads/ffmpeg/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/fio/BUILD b/benchmarks/workloads/fio/BUILD
index de257adad..7b78e8e75 100644
--- a/benchmarks/workloads/fio/BUILD
+++ b/benchmarks/workloads/fio/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":fio",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/httpd/BUILD b/benchmarks/workloads/httpd/BUILD
index eb0fb6165..a70873065 100644
--- a/benchmarks/workloads/httpd/BUILD
+++ b/benchmarks/workloads/httpd/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/iperf/BUILD b/benchmarks/workloads/iperf/BUILD
index 8832a996c..570f40148 100644
--- a/benchmarks/workloads/iperf/BUILD
+++ b/benchmarks/workloads/iperf/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":iperf",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/netcat/BUILD b/benchmarks/workloads/netcat/BUILD
index eb0fb6165..a70873065 100644
--- a/benchmarks/workloads/netcat/BUILD
+++ b/benchmarks/workloads/netcat/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/nginx/BUILD b/benchmarks/workloads/nginx/BUILD
index eb0fb6165..a70873065 100644
--- a/benchmarks/workloads/nginx/BUILD
+++ b/benchmarks/workloads/nginx/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/node/BUILD b/benchmarks/workloads/node/BUILD
index 71cd9f519..bfcf78cf9 100644
--- a/benchmarks/workloads/node/BUILD
+++ b/benchmarks/workloads/node/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/node_template/BUILD b/benchmarks/workloads/node_template/BUILD
index ca996f068..e142f082a 100644
--- a/benchmarks/workloads/node_template/BUILD
+++ b/benchmarks/workloads/node_template/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/redis/BUILD b/benchmarks/workloads/redis/BUILD
index eb0fb6165..a70873065 100644
--- a/benchmarks/workloads/redis/BUILD
+++ b/benchmarks/workloads/redis/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/redisbenchmark/BUILD b/benchmarks/workloads/redisbenchmark/BUILD
index f5994a815..f472a4443 100644
--- a/benchmarks/workloads/redisbenchmark/BUILD
+++ b/benchmarks/workloads/redisbenchmark/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":redisbenchmark",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/ruby/BUILD b/benchmarks/workloads/ruby/BUILD
index e37d77804..a3be4fe92 100644
--- a/benchmarks/workloads/ruby/BUILD
+++ b/benchmarks/workloads/ruby/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/ruby_template/BUILD b/benchmarks/workloads/ruby_template/BUILD
index 27f7c0c46..59443b14a 100644
--- a/benchmarks/workloads/ruby_template/BUILD
+++ b/benchmarks/workloads/ruby_template/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/sleep/BUILD b/benchmarks/workloads/sleep/BUILD
index eb0fb6165..a70873065 100644
--- a/benchmarks/workloads/sleep/BUILD
+++ b/benchmarks/workloads/sleep/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/sysbench/BUILD b/benchmarks/workloads/sysbench/BUILD
index fd2f8f03d..3834af7ed 100644
--- a/benchmarks/workloads/sysbench/BUILD
+++ b/benchmarks/workloads/sysbench/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":sysbench",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/syscall/BUILD b/benchmarks/workloads/syscall/BUILD
index 5100cbb21..dba4bb1e7 100644
--- a/benchmarks/workloads/syscall/BUILD
+++ b/benchmarks/workloads/syscall/BUILD
@@ -1,5 +1,4 @@
-load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
@@ -17,14 +16,14 @@ py_test(
     python_version = "PY3",
     deps = [
         ":syscall",
-        requirement("attrs", False),
-        requirement("atomicwrites", False),
-        requirement("more-itertools", False),
-        requirement("pathlib2", False),
-        requirement("pluggy", False),
-        requirement("py", False),
-        requirement("pytest", True),
-        requirement("six", False),
+        py_requirement("attrs", False),
+        py_requirement("atomicwrites", False),
+        py_requirement("more-itertools", False),
+        py_requirement("pathlib2", False),
+        py_requirement("pluggy", False),
+        py_requirement("py", False),
+        py_requirement("pytest", True),
+        py_requirement("six", False),
     ],
 )
 
diff --git a/benchmarks/workloads/tensorflow/BUILD b/benchmarks/workloads/tensorflow/BUILD
index 026c3b316..a7b7742f4 100644
--- a/benchmarks/workloads/tensorflow/BUILD
+++ b/benchmarks/workloads/tensorflow/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/benchmarks/workloads/true/BUILD b/benchmarks/workloads/true/BUILD
index 221c4b9a7..eba23d325 100644
--- a/benchmarks/workloads/true/BUILD
+++ b/benchmarks/workloads/true/BUILD
@@ -1,4 +1,4 @@
-load("@rules_pkg//:pkg.bzl", "pkg_tar")
+load("//tools:defs.bzl", "pkg_tar")
 
 package(
     default_visibility = ["//benchmarks:__subpackages__"],
diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD
index f5c08ea06..839f822eb 100644
--- a/pkg/abi/BUILD
+++ b/pkg/abi/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,6 +9,5 @@ go_library(
         "abi_linux.go",
         "flag.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/abi",
     visibility = ["//:sandbox"],
 )
diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 716ff22d2..1f3c0c687 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 # Package linux contains the constants and types needed to interface with a
 # Linux kernel. It should be used instead of syscall or golang.org/x/sys/unix
@@ -60,7 +59,6 @@ go_library(
         "wait.go",
         "xattr.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/abi/linux",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/abi",
@@ -73,7 +71,7 @@ go_test(
     name = "linux_test",
     size = "small",
     srcs = ["netfilter_test.go"],
-    embed = [":linux"],
+    library = ":linux",
     deps = [
         "//pkg/binary",
     ],
diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD
index d99e37b40..9612f072e 100644
--- a/pkg/amutex/BUILD
+++ b/pkg/amutex/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "amutex",
     srcs = ["amutex.go"],
-    importpath = "gvisor.dev/gvisor/pkg/amutex",
     visibility = ["//:sandbox"],
 )
 
@@ -14,6 +12,6 @@ go_test(
     name = "amutex_test",
     size = "small",
     srcs = ["amutex_test.go"],
-    embed = [":amutex"],
+    library = ":amutex",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD
index 6403c60c2..3948074ba 100644
--- a/pkg/atomicbitops/BUILD
+++ b/pkg/atomicbitops/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -11,7 +10,6 @@ go_library(
         "atomic_bitops_arm64.s",
         "atomic_bitops_common.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/atomicbitops",
     visibility = ["//:sandbox"],
 )
 
@@ -19,6 +17,6 @@ go_test(
     name = "atomicbitops_test",
     size = "small",
     srcs = ["atomic_bitops_test.go"],
-    embed = [":atomicbitops"],
+    library = ":atomicbitops",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/binary/BUILD b/pkg/binary/BUILD
index 543fb54bf..7ca2fda90 100644
--- a/pkg/binary/BUILD
+++ b/pkg/binary/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "binary",
     srcs = ["binary.go"],
-    importpath = "gvisor.dev/gvisor/pkg/binary",
     visibility = ["//:sandbox"],
 )
 
@@ -14,5 +12,5 @@ go_test(
     name = "binary_test",
     size = "small",
     srcs = ["binary_test.go"],
-    embed = [":binary"],
+    library = ":binary",
 )
diff --git a/pkg/bits/BUILD b/pkg/bits/BUILD
index 93b88a29a..63f4670d7 100644
--- a/pkg/bits/BUILD
+++ b/pkg/bits/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -15,7 +14,6 @@ go_library(
         "uint64_arch_arm64_asm.s",
         "uint64_arch_generic.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/bits",
     visibility = ["//:sandbox"],
 )
 
@@ -53,5 +51,5 @@ go_test(
     name = "bits_test",
     size = "small",
     srcs = ["uint64_test.go"],
-    embed = [":bits"],
+    library = ":bits",
 )
diff --git a/pkg/bpf/BUILD b/pkg/bpf/BUILD
index fba5643e8..2a6977f85 100644
--- a/pkg/bpf/BUILD
+++ b/pkg/bpf/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,7 +11,6 @@ go_library(
         "interpreter.go",
         "program_builder.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/bpf",
     visibility = ["//visibility:public"],
     deps = ["//pkg/abi/linux"],
 )
@@ -25,7 +23,7 @@ go_test(
         "interpreter_test.go",
         "program_builder_test.go",
     ],
-    embed = [":bpf"],
+    library = ":bpf",
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD
index 2bb581b18..1f75319a7 100644
--- a/pkg/compressio/BUILD
+++ b/pkg/compressio/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "compressio",
     srcs = ["compressio.go"],
-    importpath = "gvisor.dev/gvisor/pkg/compressio",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/binary",
@@ -18,5 +16,5 @@ go_test(
     name = "compressio_test",
     size = "medium",
     srcs = ["compressio_test.go"],
-    embed = [":compressio"],
+    library = ":compressio",
 )
diff --git a/pkg/control/client/BUILD b/pkg/control/client/BUILD
index 066d7b1a1..1b9e10ee7 100644
--- a/pkg/control/client/BUILD
+++ b/pkg/control/client/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -7,7 +7,6 @@ go_library(
     srcs = [
         "client.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/control/client",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/unet",
diff --git a/pkg/control/server/BUILD b/pkg/control/server/BUILD
index adbd1e3f8..002d2ef44 100644
--- a/pkg/control/server/BUILD
+++ b/pkg/control/server/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "server",
     srcs = ["server.go"],
-    importpath = "gvisor.dev/gvisor/pkg/control/server",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD
index ed111fd2a..43a432190 100644
--- a/pkg/cpuid/BUILD
+++ b/pkg/cpuid/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "cpu_amd64.s",
         "cpuid.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/cpuid",
     visibility = ["//:sandbox"],
     deps = ["//pkg/log"],
 )
@@ -18,7 +16,7 @@ go_test(
     name = "cpuid_test",
     size = "small",
     srcs = ["cpuid_test.go"],
-    embed = [":cpuid"],
+    library = ":cpuid",
 )
 
 go_test(
@@ -27,6 +25,6 @@ go_test(
     srcs = [
         "cpuid_parse_test.go",
     ],
-    embed = [":cpuid"],
+    library = ":cpuid",
     tags = ["manual"],
 )
diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD
index 9d68682c7..bee28b68d 100644
--- a/pkg/eventchannel/BUILD
+++ b/pkg/eventchannel/BUILD
@@ -1,6 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test", "proto_library")
 
 package(licenses = ["notice"])
 
@@ -10,7 +8,6 @@ go_library(
         "event.go",
         "rate.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/eventchannel",
     visibility = ["//:sandbox"],
     deps = [
         ":eventchannel_go_proto",
@@ -24,22 +21,15 @@ go_library(
 )
 
 proto_library(
-    name = "eventchannel_proto",
+    name = "eventchannel",
     srcs = ["event.proto"],
     visibility = ["//:sandbox"],
 )
 
-go_proto_library(
-    name = "eventchannel_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto",
-    proto = ":eventchannel_proto",
-    visibility = ["//:sandbox"],
-)
-
 go_test(
     name = "eventchannel_test",
     srcs = ["event_test.go"],
-    embed = [":eventchannel"],
+    library = ":eventchannel",
     deps = [
         "//pkg/sync",
         "@com_github_golang_protobuf//proto:go_default_library",
diff --git a/pkg/fd/BUILD b/pkg/fd/BUILD
index afa8f7659..872361546 100644
--- a/pkg/fd/BUILD
+++ b/pkg/fd/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "fd",
     srcs = ["fd.go"],
-    importpath = "gvisor.dev/gvisor/pkg/fd",
     visibility = ["//visibility:public"],
 )
 
@@ -14,5 +12,5 @@ go_test(
     name = "fd_test",
     size = "small",
     srcs = ["fd_test.go"],
-    embed = [":fd"],
+    library = ":fd",
 )
diff --git a/pkg/fdchannel/BUILD b/pkg/fdchannel/BUILD
index b0478c672..d9104ef02 100644
--- a/pkg/fdchannel/BUILD
+++ b/pkg/fdchannel/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "fdchannel",
     srcs = ["fdchannel_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/pkg/fdchannel",
     visibility = ["//visibility:public"],
 )
 
@@ -14,6 +12,6 @@ go_test(
     name = "fdchannel_test",
     size = "small",
     srcs = ["fdchannel_test.go"],
-    embed = [":fdchannel"],
+    library = ":fdchannel",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/fdnotifier/BUILD b/pkg/fdnotifier/BUILD
index 91a202a30..235dcc490 100644
--- a/pkg/fdnotifier/BUILD
+++ b/pkg/fdnotifier/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "fdnotifier.go",
         "poll_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/fdnotifier",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/sync",
diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD
index 85bd83af1..9c5ad500b 100644
--- a/pkg/flipcall/BUILD
+++ b/pkg/flipcall/BUILD
@@ -1,7 +1,6 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "flipcall",
@@ -13,7 +12,6 @@ go_library(
         "io.go",
         "packet_window_allocator.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/flipcall",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/abi/linux",
@@ -30,6 +28,6 @@ go_test(
         "flipcall_example_test.go",
         "flipcall_test.go",
     ],
-    embed = [":flipcall"],
+    library = ":flipcall",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD
index ca540363c..ee84471b2 100644
--- a/pkg/fspath/BUILD
+++ b/pkg/fspath/BUILD
@@ -1,10 +1,8 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(
-    default_visibility = ["//visibility:public"],
-    licenses = ["notice"],
-)
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])
 
 go_library(
     name = "fspath",
@@ -13,7 +11,6 @@ go_library(
         "builder_unsafe.go",
         "fspath.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/fspath",
 )
 
 go_test(
@@ -23,5 +20,5 @@ go_test(
         "builder_test.go",
         "fspath_test.go",
     ],
-    embed = [":fspath"],
+    library = ":fspath",
 )
diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD
index f22bd070d..dd3141143 100644
--- a/pkg/gate/BUILD
+++ b/pkg/gate/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -8,7 +7,6 @@ go_library(
     srcs = [
         "gate.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/gate",
     visibility = ["//visibility:public"],
 )
 
diff --git a/pkg/goid/BUILD b/pkg/goid/BUILD
index 5d31e5366..ea8d2422c 100644
--- a/pkg/goid/BUILD
+++ b/pkg/goid/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -11,7 +10,6 @@ go_library(
         "goid_race.go",
         "goid_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/goid",
     visibility = ["//visibility:public"],
 )
 
@@ -22,5 +20,5 @@ go_test(
         "empty_test.go",
         "goid_test.go",
     ],
-    embed = [":goid"],
+    library = ":goid",
 )
diff --git a/pkg/ilist/BUILD b/pkg/ilist/BUILD
index 34d2673ef..3f6eb07df 100644
--- a/pkg/ilist/BUILD
+++ b/pkg/ilist/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
     srcs = [
         "interface_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/ilist",
     visibility = ["//visibility:public"],
 )
 
@@ -41,7 +39,7 @@ go_test(
         "list_test.go",
         "test_list.go",
     ],
-    embed = [":ilist"],
+    library = ":ilist",
 )
 
 go_template(
diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD
index bcde6d308..41bf104d0 100644
--- a/pkg/linewriter/BUILD
+++ b/pkg/linewriter/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "linewriter",
     srcs = ["linewriter.go"],
-    importpath = "gvisor.dev/gvisor/pkg/linewriter",
     visibility = ["//visibility:public"],
     deps = ["//pkg/sync"],
 )
@@ -14,5 +12,5 @@ go_library(
 go_test(
     name = "linewriter_test",
     srcs = ["linewriter_test.go"],
-    embed = [":linewriter"],
+    library = ":linewriter",
 )
diff --git a/pkg/log/BUILD b/pkg/log/BUILD
index 0df0f2849..935d06963 100644
--- a/pkg/log/BUILD
+++ b/pkg/log/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,7 +11,6 @@ go_library(
         "json_k8s.go",
         "log.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/log",
     visibility = [
         "//visibility:public",
     ],
@@ -29,5 +27,5 @@ go_test(
         "json_test.go",
         "log_test.go",
     ],
-    embed = [":log"],
+    library = ":log",
 )
diff --git a/pkg/memutil/BUILD b/pkg/memutil/BUILD
index 7b50e2b28..9d07d98b4 100644
--- a/pkg/memutil/BUILD
+++ b/pkg/memutil/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "memutil",
     srcs = ["memutil_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/pkg/memutil",
     visibility = ["//visibility:public"],
     deps = ["@org_golang_x_sys//unix:go_default_library"],
 )
diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD
index 9145f3233..58305009d 100644
--- a/pkg/metric/BUILD
+++ b/pkg/metric/BUILD
@@ -1,14 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("@rules_cc//cc:defs.bzl", "cc_proto_library")
+load("//tools:defs.bzl", "go_library", "go_test", "proto_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "metric",
     srcs = ["metric.go"],
-    importpath = "gvisor.dev/gvisor/pkg/metric",
     visibility = ["//:sandbox"],
     deps = [
         ":metric_go_proto",
@@ -19,28 +15,15 @@ go_library(
 )
 
 proto_library(
-    name = "metric_proto",
+    name = "metric",
     srcs = ["metric.proto"],
     visibility = ["//:sandbox"],
 )
 
-cc_proto_library(
-    name = "metric_cc_proto",
-    visibility = ["//:sandbox"],
-    deps = [":metric_proto"],
-)
-
-go_proto_library(
-    name = "metric_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/metric/metric_go_proto",
-    proto = ":metric_proto",
-    visibility = ["//:sandbox"],
-)
-
 go_test(
     name = "metric_test",
     srcs = ["metric_test.go"],
-    embed = [":metric"],
+    library = ":metric",
     deps = [
         ":metric_go_proto",
         "//pkg/eventchannel",
diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD
index a3e05c96d..4ccc1de86 100644
--- a/pkg/p9/BUILD
+++ b/pkg/p9/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -23,7 +22,6 @@ go_library(
         "transport_flipcall.go",
         "version.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/p9",
     deps = [
         "//pkg/fd",
         "//pkg/fdchannel",
@@ -47,7 +45,7 @@ go_test(
         "transport_test.go",
         "version_test.go",
     ],
-    embed = [":p9"],
+    library = ":p9",
     deps = [
         "//pkg/fd",
         "//pkg/unet",
diff --git a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD
index f4edd68b2..7ca67cb19 100644
--- a/pkg/p9/p9test/BUILD
+++ b/pkg/p9/p9test/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_test")
+load("//tools:defs.bzl", "go_binary", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -64,7 +63,6 @@ go_library(
         "mocks.go",
         "p9test.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/p9/p9test",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/fd",
@@ -80,7 +78,7 @@ go_test(
     name = "client_test",
     size = "medium",
     srcs = ["client_test.go"],
-    embed = [":p9test"],
+    library = ":p9test",
     deps = [
         "//pkg/fd",
         "//pkg/p9",
diff --git a/pkg/procid/BUILD b/pkg/procid/BUILD
index b506813f0..aa3e3ac0b 100644
--- a/pkg/procid/BUILD
+++ b/pkg/procid/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +9,6 @@ go_library(
         "procid_amd64.s",
         "procid_arm64.s",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/procid",
     visibility = ["//visibility:public"],
 )
 
@@ -20,7 +18,7 @@ go_test(
     srcs = [
         "procid_test.go",
     ],
-    embed = [":procid"],
+    library = ":procid",
     deps = ["//pkg/sync"],
 )
 
@@ -31,6 +29,6 @@ go_test(
         "procid_net_test.go",
         "procid_test.go",
     ],
-    embed = [":procid"],
+    library = ":procid",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/rand/BUILD b/pkg/rand/BUILD
index 9d5b4859b..80b8ceb02 100644
--- a/pkg/rand/BUILD
+++ b/pkg/rand/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "rand.go",
         "rand_linux.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/rand",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/sync",
diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD
index 974d9af9b..74affc887 100644
--- a/pkg/refs/BUILD
+++ b/pkg/refs/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -23,7 +22,6 @@ go_library(
         "refcounter_state.go",
         "weak_ref_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/refs",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
@@ -35,6 +33,6 @@ go_test(
     name = "refs_test",
     size = "small",
     srcs = ["refcounter_test.go"],
-    embed = [":refs"],
+    library = ":refs",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD
index af94e944d..742c8b79b 100644
--- a/pkg/seccomp/BUILD
+++ b/pkg/seccomp/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_embed_data", "go_test")
+load("//tools:defs.bzl", "go_binary", "go_embed_data", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -27,7 +26,6 @@ go_library(
         "seccomp_rules.go",
         "seccomp_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/seccomp",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/abi/linux",
@@ -43,7 +41,7 @@ go_test(
         "seccomp_test.go",
         ":victim_data",
     ],
-    embed = [":seccomp"],
+    library = ":seccomp",
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
diff --git a/pkg/secio/BUILD b/pkg/secio/BUILD
index 22abdc69f..60f63c7a6 100644
--- a/pkg/secio/BUILD
+++ b/pkg/secio/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "full_reader.go",
         "secio.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/secio",
     visibility = ["//pkg/sentry:internal"],
 )
 
@@ -17,5 +15,5 @@ go_test(
     name = "secio_test",
     size = "small",
     srcs = ["secio_test.go"],
-    embed = [":secio"],
+    library = ":secio",
 )
diff --git a/pkg/segment/test/BUILD b/pkg/segment/test/BUILD
index a27c35e21..f2d8462d8 100644
--- a/pkg/segment/test/BUILD
+++ b/pkg/segment/test/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(
@@ -38,7 +37,6 @@ go_library(
         "int_set.go",
         "set_functions.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/segment/segment",
     deps = [
         "//pkg/state",
     ],
@@ -48,5 +46,5 @@ go_test(
     name = "segment_test",
     size = "small",
     srcs = ["segment_test.go"],
-    embed = [":segment"],
+    library = ":segment",
 )
diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD
index 2d6379c86..e8b794179 100644
--- a/pkg/sentry/BUILD
+++ b/pkg/sentry/BUILD
@@ -6,6 +6,8 @@ package(licenses = ["notice"])
 package_group(
     name = "internal",
     packages = [
+        "//cloud/gvisor/gopkg/sentry/...",
+        "//cloud/gvisor/sentry/...",
         "//pkg/sentry/...",
         "//runsc/...",
         # Code generated by go_marshal relies on go_marshal libraries.
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index 65f22af2b..51ca09b24 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -1,6 +1,4 @@
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@rules_cc//cc:defs.bzl", "cc_proto_library")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "proto_library")
 
 package(licenses = ["notice"])
 
@@ -27,7 +25,6 @@ go_library(
         "syscalls_amd64.go",
         "syscalls_arm64.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/arch",
     visibility = ["//:sandbox"],
     deps = [
         ":registers_go_proto",
@@ -44,20 +41,7 @@ go_library(
 )
 
 proto_library(
-    name = "registers_proto",
+    name = "registers",
     srcs = ["registers.proto"],
     visibility = ["//visibility:public"],
 )
-
-cc_proto_library(
-    name = "registers_cc_proto",
-    visibility = ["//visibility:public"],
-    deps = [":registers_proto"],
-)
-
-go_proto_library(
-    name = "registers_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto",
-    proto = ":registers_proto",
-    visibility = ["//visibility:public"],
-)
diff --git a/pkg/sentry/context/BUILD b/pkg/sentry/context/BUILD
index 8dc1a77b1..e13a9ce20 100644
--- a/pkg/sentry/context/BUILD
+++ b/pkg/sentry/context/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "context",
     srcs = ["context.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/context",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/amutex",
diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD
index 581e7aa96..f91a6d4ed 100644
--- a/pkg/sentry/context/contexttest/BUILD
+++ b/pkg/sentry/context/contexttest/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "contexttest",
     testonly = 1,
     srcs = ["contexttest.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/context/contexttest",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/memutil",
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index 2561a6109..e69496477 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,9 +11,8 @@ go_library(
         "proc.go",
         "state.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/control",
     visibility = [
-        "//pkg/sentry:internal",
+        "//:sandbox",
     ],
     deps = [
         "//pkg/abi/linux",
@@ -40,7 +38,7 @@ go_test(
     name = "control_test",
     size = "small",
     srcs = ["proc_test.go"],
-    embed = [":control"],
+    library = ":control",
     deps = [
         "//pkg/log",
         "//pkg/sentry/kernel/time",
diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD
index 97fa1512c..e403cbd8b 100644
--- a/pkg/sentry/device/BUILD
+++ b/pkg/sentry/device/BUILD
@@ -1,12 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "device",
     srcs = ["device.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/device",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -18,5 +16,5 @@ go_test(
     name = "device_test",
     size = "small",
     srcs = ["device_test.go"],
-    embed = [":device"],
+    library = ":device",
 )
diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD
index 7d5d72d5a..605d61dbe 100644
--- a/pkg/sentry/fs/BUILD
+++ b/pkg/sentry/fs/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -44,7 +43,6 @@ go_library(
         "splice.go",
         "sync.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -129,7 +127,7 @@ go_test(
         "mount_test.go",
         "path_test.go",
     ],
-    embed = [":fs"],
+    library = ":fs",
     deps = [
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD
index ae1c9cf76..c14e5405e 100644
--- a/pkg/sentry/fs/anon/BUILD
+++ b/pkg/sentry/fs/anon/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "anon.go",
         "device.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/anon",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index a0d9e8496..0c7247bd7 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -13,7 +13,6 @@ go_library(
         "random.go",
         "tty.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/dev",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD
index cc43de69d..25ef96299 100644
--- a/pkg/sentry/fs/fdpipe/BUILD
+++ b/pkg/sentry/fs/fdpipe/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +9,6 @@ go_library(
         "pipe_opener.go",
         "pipe_state.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/fdpipe",
     imports = ["gvisor.dev/gvisor/pkg/sentry/fs"],
     visibility = ["//pkg/sentry:internal"],
     deps = [
@@ -36,7 +34,7 @@ go_test(
         "pipe_opener_test.go",
         "pipe_test.go",
     ],
-    embed = [":fdpipe"],
+    library = ":fdpipe",
     deps = [
         "//pkg/fd",
         "//pkg/fdnotifier",
diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD
index 358dc2be3..9a7608cae 100644
--- a/pkg/sentry/fs/filetest/BUILD
+++ b/pkg/sentry/fs/filetest/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "filetest",
     testonly = 1,
     srcs = ["filetest.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/filetest",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/sentry/context",
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index 945b6270d..9142f5bdf 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -75,7 +74,6 @@ go_library(
         "inode.go",
         "inode_cached.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/fsutil",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -106,7 +104,7 @@ go_test(
         "dirty_set_test.go",
         "inode_cached_test.go",
     ],
-    embed = [":fsutil"],
+    library = ":fsutil",
     deps = [
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD
index fd870e8e1..cf48e7c03 100644
--- a/pkg/sentry/fs/gofer/BUILD
+++ b/pkg/sentry/fs/gofer/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -22,7 +21,6 @@ go_library(
         "socket.go",
         "util.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/gofer",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -56,7 +54,7 @@ go_test(
     name = "gofer_test",
     size = "small",
     srcs = ["gofer_test.go"],
-    embed = [":gofer"],
+    library = ":gofer",
     deps = [
         "//pkg/p9",
         "//pkg/p9/p9test",
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index 2b581aa69..f586f47c1 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -25,7 +24,6 @@ go_library(
         "util_arm64_unsafe.go",
         "util_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/host",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -69,7 +67,7 @@ go_test(
         "socket_test.go",
         "wait_test.go",
     ],
-    embed = [":host"],
+    library = ":host",
     deps = [
         "//pkg/fd",
         "//pkg/fdnotifier",
diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD
index 2c332a82a..ae3331737 100644
--- a/pkg/sentry/fs/lock/BUILD
+++ b/pkg/sentry/fs/lock/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -40,7 +39,6 @@ go_library(
         "lock_set.go",
         "lock_set_functions.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/lock",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/log",
@@ -56,5 +54,5 @@ go_test(
         "lock_range_test.go",
         "lock_test.go",
     ],
-    embed = [":lock"],
+    library = ":lock",
 )
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index cb37c6c6b..b06bead41 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -27,7 +26,6 @@ go_library(
         "uptime.go",
         "version.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/proc",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -63,7 +61,7 @@ go_test(
         "net_test.go",
         "sys_net_test.go",
     ],
-    embed = [":proc"],
+    library = ":proc",
     deps = [
         "//pkg/abi/linux",
         "//pkg/sentry/context",
diff --git a/pkg/sentry/fs/proc/device/BUILD b/pkg/sentry/fs/proc/device/BUILD
index 0394451d4..52c9aa93d 100644
--- a/pkg/sentry/fs/proc/device/BUILD
+++ b/pkg/sentry/fs/proc/device/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "device",
     srcs = ["device.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/proc/device",
     visibility = ["//pkg/sentry:internal"],
     deps = ["//pkg/sentry/device"],
 )
diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD
index 38b246dff..310d8dd52 100644
--- a/pkg/sentry/fs/proc/seqfile/BUILD
+++ b/pkg/sentry/fs/proc/seqfile/BUILD
@@ -1,12 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "seqfile",
     srcs = ["seqfile.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -26,7 +24,7 @@ go_test(
     name = "seqfile_test",
     size = "small",
     srcs = ["seqfile_test.go"],
-    embed = [":seqfile"],
+    library = ":seqfile",
     deps = [
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD
index 3fb7b0633..39c4b84f8 100644
--- a/pkg/sentry/fs/ramfs/BUILD
+++ b/pkg/sentry/fs/ramfs/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -11,7 +10,6 @@ go_library(
         "symlink.go",
         "tree.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ramfs",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -31,7 +29,7 @@ go_test(
     name = "ramfs_test",
     size = "small",
     srcs = ["tree_test.go"],
-    embed = [":ramfs"],
+    library = ":ramfs",
     deps = [
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD
index 25f0f124e..cc6b3bfbf 100644
--- a/pkg/sentry/fs/sys/BUILD
+++ b/pkg/sentry/fs/sys/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -10,7 +10,6 @@ go_library(
         "fs.go",
         "sys.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/sys",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD
index a215c1b95..092668e8d 100644
--- a/pkg/sentry/fs/timerfd/BUILD
+++ b/pkg/sentry/fs/timerfd/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "timerfd",
     srcs = ["timerfd.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/timerfd",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/sentry/context",
diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD
index 3400b940c..04776555f 100644
--- a/pkg/sentry/fs/tmpfs/BUILD
+++ b/pkg/sentry/fs/tmpfs/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,7 +11,6 @@ go_library(
         "inode_file.go",
         "tmpfs.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -41,7 +39,7 @@ go_test(
     name = "tmpfs_test",
     size = "small",
     srcs = ["file_test.go"],
-    embed = [":tmpfs"],
+    library = ":tmpfs",
     deps = [
         "//pkg/sentry/context",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index f6f60d0cf..29f804c6c 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -14,7 +13,6 @@ go_library(
         "slave.go",
         "terminal.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fs/tty",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -40,7 +38,7 @@ go_test(
     name = "tty_test",
     size = "small",
     srcs = ["tty_test.go"],
-    embed = [":tty"],
+    library = ":tty",
     deps = [
         "//pkg/abi/linux",
         "//pkg/sentry/context/contexttest",
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 903874141..a718920d5 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -32,7 +31,6 @@ go_library(
         "symlink.go",
         "utils.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -71,7 +69,7 @@ go_test(
         "//pkg/sentry/fsimpl/ext:assets/tiny.ext3",
         "//pkg/sentry/fsimpl/ext:assets/tiny.ext4",
     ],
-    embed = [":ext"],
+    library = ":ext",
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD
index 4fc8296ef..12f3990c1 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/BUILD
+++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD
index fcfaf5c3e..9bd9c76c0 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/BUILD
+++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -23,7 +22,6 @@ go_library(
         "superblock_old.go",
         "test_utils.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -44,6 +42,6 @@ go_test(
         "inode_test.go",
         "superblock_test.go",
     ],
-    embed = [":disklayout"],
+    library = ":disklayout",
     deps = ["//pkg/sentry/kernel/time"],
 )
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 66d409785..7bf83ccba 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -1,8 +1,7 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_template_instance(
     name = "slot_list",
@@ -27,7 +26,6 @@ go_library(
         "slot_list.go",
         "symlink.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index c5b79fb38..3768f55b2 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -1,7 +1,6 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "proc",
@@ -15,7 +14,6 @@ go_library(
         "tasks_net.go",
         "tasks_sys.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc",
     deps = [
         "//pkg/abi/linux",
         "//pkg/log",
@@ -47,7 +45,7 @@ go_test(
         "tasks_sys_test.go",
         "tasks_test.go",
     ],
-    embed = [":proc"],
+    library = ":proc",
     deps = [
         "//pkg/abi/linux",
         "//pkg/fspath",
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index ee3c842bd..beda141f1 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -1,14 +1,12 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "sys",
     srcs = [
         "sys.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys",
     deps = [
         "//pkg/abi/linux",
         "//pkg/sentry/context",
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
index 4e70d84a7..12053a5b6 100644
--- a/pkg/sentry/fsimpl/testutil/BUILD
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -1,6 +1,6 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "testutil",
@@ -9,7 +9,6 @@ go_library(
         "kernel.go",
         "testutil.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 691476b4f..857e98bc5 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -1,8 +1,7 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_template_instance(
     name = "dentry_list",
@@ -28,7 +27,6 @@ go_library(
         "symlink.go",
         "tmpfs.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs",
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
@@ -81,7 +79,7 @@ go_test(
         "regular_file_test.go",
         "stat_test.go",
     ],
-    embed = [":tmpfs"],
+    library = ":tmpfs",
     deps = [
         "//pkg/abi/linux",
         "//pkg/fspath",
diff --git a/pkg/sentry/hostcpu/BUILD b/pkg/sentry/hostcpu/BUILD
index 359468ccc..e6933aa70 100644
--- a/pkg/sentry/hostcpu/BUILD
+++ b/pkg/sentry/hostcpu/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +9,6 @@ go_library(
         "getcpu_arm64.s",
         "hostcpu.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/hostcpu",
     visibility = ["//:sandbox"],
 )
 
@@ -18,5 +16,5 @@ go_test(
     name = "hostcpu_test",
     size = "small",
     srcs = ["hostcpu_test.go"],
-    embed = [":hostcpu"],
+    library = ":hostcpu",
 )
diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD
index 67831d5a1..a145a5ca3 100644
--- a/pkg/sentry/hostmm/BUILD
+++ b/pkg/sentry/hostmm/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "cgroup.go",
         "hostmm.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/hostmm",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/fd",
diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index 8d60ad4ad..aa621b724 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -12,7 +12,6 @@ go_library(
         "inet.go",
         "test_stack.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/inet",
     deps = [
         "//pkg/sentry/context",
         "//pkg/tcpip/stack",
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index ac85ba0c8..cebaccd92 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -1,8 +1,5 @@
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("@rules_cc//cc:defs.bzl", "cc_proto_library")
+load("//tools:defs.bzl", "go_library", "go_test", "proto_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -78,26 +75,12 @@ go_template_instance(
 )
 
 proto_library(
-    name = "uncaught_signal_proto",
+    name = "uncaught_signal",
     srcs = ["uncaught_signal.proto"],
     visibility = ["//visibility:public"],
     deps = ["//pkg/sentry/arch:registers_proto"],
 )
 
-cc_proto_library(
-    name = "uncaught_signal_cc_proto",
-    visibility = ["//visibility:public"],
-    deps = [":uncaught_signal_proto"],
-)
-
-go_proto_library(
-    name = "uncaught_signal_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto",
-    proto = ":uncaught_signal_proto",
-    visibility = ["//visibility:public"],
-    deps = ["//pkg/sentry/arch:registers_go_proto"],
-)
-
 go_library(
     name = "kernel",
     srcs = [
@@ -156,7 +139,6 @@ go_library(
         "vdso.go",
         "version.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel",
     imports = [
         "gvisor.dev/gvisor/pkg/bpf",
         "gvisor.dev/gvisor/pkg/sentry/device",
@@ -227,7 +209,7 @@ go_test(
         "task_test.go",
         "timekeeper_test.go",
     ],
-    embed = [":kernel"],
+    library = ":kernel",
     deps = [
         "//pkg/abi",
         "//pkg/sentry/arch",
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 1aa72fa47..64537c9be 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -1,5 +1,5 @@
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -57,7 +57,6 @@ go_library(
         "id_map_set.go",
         "user_namespace.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/auth",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD
index 3a88a585c..daff608d7 100644
--- a/pkg/sentry/kernel/contexttest/BUILD
+++ b/pkg/sentry/kernel/contexttest/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "contexttest",
     testonly = 1,
     srcs = ["contexttest.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/sentry/context",
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD
index c47f6b6fc..19e16ab3a 100644
--- a/pkg/sentry/kernel/epoll/BUILD
+++ b/pkg/sentry/kernel/epoll/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -23,7 +22,6 @@ go_library(
         "epoll_list.go",
         "epoll_state.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/epoll",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/refs",
@@ -43,7 +41,7 @@ go_test(
     srcs = [
         "epoll_test.go",
     ],
-    embed = [":epoll"],
+    library = ":epoll",
     deps = [
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fs/filetest",
diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD
index c831fbab2..ee2d74864 100644
--- a/pkg/sentry/kernel/eventfd/BUILD
+++ b/pkg/sentry/kernel/eventfd/BUILD
@@ -1,12 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "eventfd",
     srcs = ["eventfd.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -26,7 +24,7 @@ go_test(
     name = "eventfd_test",
     size = "small",
     srcs = ["eventfd_test.go"],
-    embed = [":eventfd"],
+    library = ":eventfd",
     deps = [
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/usermem",
diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD
index 6b36bc63e..b9126e946 100644
--- a/pkg/sentry/kernel/fasync/BUILD
+++ b/pkg/sentry/kernel/fasync/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "fasync",
     srcs = ["fasync.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/fasync",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index 50db443ce..f413d8ae2 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -34,7 +33,6 @@ go_library(
         "futex.go",
         "waiter_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/futex",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -51,7 +49,7 @@ go_test(
     name = "futex_test",
     size = "small",
     srcs = ["futex_test.go"],
-    embed = [":futex"],
+    library = ":futex",
     deps = [
         "//pkg/sentry/usermem",
         "//pkg/sync",
diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD
index 7f36252a9..4486848d2 100644
--- a/pkg/sentry/kernel/memevent/BUILD
+++ b/pkg/sentry/kernel/memevent/BUILD
@@ -1,13 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@rules_cc//cc:defs.bzl", "cc_proto_library")
+load("//tools:defs.bzl", "go_library", "proto_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "memevent",
     srcs = ["memory_events.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/memevent",
     visibility = ["//:sandbox"],
     deps = [
         ":memory_events_go_proto",
@@ -21,20 +18,7 @@ go_library(
 )
 
 proto_library(
-    name = "memory_events_proto",
+    name = "memory_events",
     srcs = ["memory_events.proto"],
     visibility = ["//visibility:public"],
 )
-
-cc_proto_library(
-    name = "memory_events_cc_proto",
-    visibility = ["//visibility:public"],
-    deps = [":memory_events_proto"],
-)
-
-go_proto_library(
-    name = "memory_events_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto",
-    proto = ":memory_events_proto",
-    visibility = ["//visibility:public"],
-)
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 5eeaeff66..2c7b6206f 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -30,7 +29,6 @@ go_library(
         "vfs.go",
         "writer.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/pipe",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -57,7 +55,7 @@ go_test(
         "node_test.go",
         "pipe_test.go",
     ],
-    embed = [":pipe"],
+    library = ":pipe",
     deps = [
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD
index 98ea7a0d8..1b82e087b 100644
--- a/pkg/sentry/kernel/sched/BUILD
+++ b/pkg/sentry/kernel/sched/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "cpuset.go",
         "sched.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/sched",
     visibility = ["//pkg/sentry:internal"],
 )
 
@@ -17,5 +15,5 @@ go_test(
     name = "sched_test",
     size = "small",
     srcs = ["cpuset_test.go"],
-    embed = [":sched"],
+    library = ":sched",
 )
diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD
index 13a961594..76e19b551 100644
--- a/pkg/sentry/kernel/semaphore/BUILD
+++ b/pkg/sentry/kernel/semaphore/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -22,7 +21,6 @@ go_library(
         "semaphore.go",
         "waiter_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -40,7 +38,7 @@ go_test(
     name = "semaphore_test",
     size = "small",
     srcs = ["semaphore_test.go"],
-    embed = [":semaphore"],
+    library = ":semaphore",
     deps = [
         "//pkg/abi/linux",
         "//pkg/sentry/context",
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index 7321b22ed..5547c5abf 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "device.go",
         "shm.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/shm",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD
index 89e4d84b1..5d44773d4 100644
--- a/pkg/sentry/kernel/signalfd/BUILD
+++ b/pkg/sentry/kernel/signalfd/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "signalfd",
     srcs = ["signalfd.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD
index 4e4de0512..d49594d9f 100644
--- a/pkg/sentry/kernel/time/BUILD
+++ b/pkg/sentry/kernel/time/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "context.go",
         "time.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/time",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD
index 9fa841e8b..67869757f 100644
--- a/pkg/sentry/limits/BUILD
+++ b/pkg/sentry/limits/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +9,6 @@ go_library(
         "limits.go",
         "linux.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/limits",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
@@ -25,5 +23,5 @@ go_test(
     srcs = [
         "limits_test.go",
     ],
-    embed = [":limits"],
+    library = ":limits",
 )
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
index 2890393bd..d4ad2bd6c 100644
--- a/pkg/sentry/loader/BUILD
+++ b/pkg/sentry/loader/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_embed_data")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_embed_data", "go_library")
 
 package(licenses = ["notice"])
 
@@ -20,7 +19,6 @@ go_library(
         "vdso_state.go",
         ":vdso_bin",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/loader",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi",
diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD
index 112794e9c..f9a65f086 100644
--- a/pkg/sentry/memmap/BUILD
+++ b/pkg/sentry/memmap/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -37,7 +36,6 @@ go_library(
         "mapping_set_impl.go",
         "memmap.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/memmap",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/log",
@@ -52,6 +50,6 @@ go_test(
     name = "memmap_test",
     size = "small",
     srcs = ["mapping_set_test.go"],
-    embed = [":memmap"],
+    library = ":memmap",
     deps = ["//pkg/sentry/usermem"],
 )
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index 83e248431..bd6399fa2 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -96,7 +95,6 @@ go_library(
         "vma.go",
         "vma_set.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/mm",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -128,7 +126,7 @@ go_test(
     name = "mm_test",
     size = "small",
     srcs = ["mm_test.go"],
-    embed = [":mm"],
+    library = ":mm",
     deps = [
         "//pkg/sentry/arch",
         "//pkg/sentry/context",
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index a9a2642c5..02385a3ce 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -60,7 +59,6 @@ go_library(
         "save_restore.go",
         "usage_set.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/pgalloc",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/log",
@@ -82,6 +80,6 @@ go_test(
     name = "pgalloc_test",
     size = "small",
     srcs = ["pgalloc_test.go"],
-    embed = [":pgalloc"],
+    library = ":pgalloc",
     deps = ["//pkg/sentry/usermem"],
 )
diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD
index 157bffa81..006450b2d 100644
--- a/pkg/sentry/platform/BUILD
+++ b/pkg/sentry/platform/BUILD
@@ -1,5 +1,5 @@
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -22,7 +22,6 @@ go_library(
         "mmap_min_addr.go",
         "platform.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD
index 85e882df9..83b385f14 100644
--- a/pkg/sentry/platform/interrupt/BUILD
+++ b/pkg/sentry/platform/interrupt/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -8,7 +7,6 @@ go_library(
     srcs = [
         "interrupt.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/interrupt",
     visibility = ["//pkg/sentry:internal"],
     deps = ["//pkg/sync"],
 )
@@ -17,5 +15,5 @@ go_test(
     name = "interrupt_test",
     size = "small",
     srcs = ["interrupt_test.go"],
-    embed = [":interrupt"],
+    library = ":interrupt",
 )
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 6a358d1d4..a4532a766 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -38,7 +37,6 @@ go_library(
         "physical_map_arm64.go",
         "virtual_map.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/kvm",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -65,7 +63,7 @@ go_test(
         "kvm_test.go",
         "virtual_map_test.go",
     ],
-    embed = [":kvm"],
+    library = ":kvm",
     tags = [
         "manual",
         "nogotsan",
diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD
index b0e45f159..f7605df8a 100644
--- a/pkg/sentry/platform/kvm/testutil/BUILD
+++ b/pkg/sentry/platform/kvm/testutil/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -12,6 +12,5 @@ go_library(
         "testutil_arm64.go",
         "testutil_arm64.s",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil",
     visibility = ["//pkg/sentry/platform/kvm:__pkg__"],
 )
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index cd13390c3..3bcc5e040 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -20,7 +20,6 @@ go_library(
         "subprocess_linux_unsafe.go",
         "subprocess_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ptrace",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD
index 87f4552b5..6dee8fcc5 100644
--- a/pkg/sentry/platform/ring0/BUILD
+++ b/pkg/sentry/platform/ring0/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -74,7 +74,6 @@ go_library(
         "lib_arm64.s",
         "ring0.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/cpuid",
diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD
index 42076fb04..147311ed3 100644
--- a/pkg/sentry/platform/ring0/gen_offsets/BUILD
+++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 387a7f6c3..8b5cdd6c1 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -1,17 +1,14 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test", "select_arch")
 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
 
-config_setting(
-    name = "aarch64",
-    constraint_values = ["@bazel_tools//platforms:aarch64"],
-)
-
 go_template(
     name = "generic_walker",
-    srcs = ["walker_amd64.go"],
+    srcs = select_arch(
+        amd64 = ["walker_amd64.go"],
+        arm64 = ["walker_amd64.go"],
+    ),
     opt_types = [
         "Visitor",
     ],
@@ -91,7 +88,6 @@ go_library(
         "walker_map.go",
         "walker_unmap.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables",
     visibility = [
         "//pkg/sentry/platform/kvm:__subpackages__",
         "//pkg/sentry/platform/ring0:__subpackages__",
@@ -111,6 +107,6 @@ go_test(
         "pagetables_test.go",
         "walker_check.go",
     ],
-    embed = [":pagetables"],
+    library = ":pagetables",
     deps = ["//pkg/sentry/usermem"],
 )
diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD
index 6769cd0a5..b8747585b 100644
--- a/pkg/sentry/platform/safecopy/BUILD
+++ b/pkg/sentry/platform/safecopy/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -17,7 +16,6 @@ go_library(
         "sighandler_amd64.s",
         "sighandler_arm64.s",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/platform/safecopy",
     visibility = ["//pkg/sentry:internal"],
     deps = ["//pkg/syserror"],
 )
@@ -27,5 +25,5 @@ go_test(
     srcs = [
         "safecopy_test.go",
     ],
-    embed = [":safecopy"],
+    library = ":safecopy",
 )
diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD
index 884020f7b..3ab76da97 100644
--- a/pkg/sentry/safemem/BUILD
+++ b/pkg/sentry/safemem/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -11,7 +10,6 @@ go_library(
         "safemem.go",
         "seq_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/safemem",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/sentry/platform/safecopy",
@@ -25,5 +23,5 @@ go_test(
         "io_test.go",
         "seq_test.go",
     ],
-    embed = [":safemem"],
+    library = ":safemem",
 )
diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sentry/sighandling/BUILD
index f561670c7..6c38a3f44 100644
--- a/pkg/sentry/sighandling/BUILD
+++ b/pkg/sentry/sighandling/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "sighandling.go",
         "sighandling_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/sighandling",
     visibility = ["//pkg/sentry:internal"],
     deps = ["//pkg/abi/linux"],
 )
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index 26176b10d..8e2b97afb 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "socket",
     srcs = ["socket.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 357517ed4..3850f6345 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "control",
     srcs = ["control.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/control",
     imports = [
         "gvisor.dev/gvisor/pkg/sentry/fs",
     ],
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index 4c44c7c0f..42bf7be6a 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -12,7 +12,6 @@ go_library(
         "socket_unsafe.go",
         "stack.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/hostinet",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index b70047d81..ed34a8308 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -7,7 +7,6 @@ go_library(
     srcs = [
         "netfilter.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netfilter",
     # This target depends on netstack and should only be used by epsocket,
     # which is allowed to depend on netstack.
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 103933144..baaac13c6 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,6 @@ go_library(
         "provider.go",
         "socket.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD
index 2d9f4ba9b..3a22923d8 100644
--- a/pkg/sentry/socket/netlink/port/BUILD
+++ b/pkg/sentry/socket/netlink/port/BUILD
@@ -1,12 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "port",
     srcs = ["port.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port",
     visibility = ["//pkg/sentry:internal"],
     deps = ["//pkg/sync"],
 )
@@ -14,5 +12,5 @@ go_library(
 go_test(
     name = "port_test",
     srcs = ["port_test.go"],
-    embed = [":port"],
+    library = ":port",
 )
diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD
index 1d4912753..2137c7aeb 100644
--- a/pkg/sentry/socket/netlink/route/BUILD
+++ b/pkg/sentry/socket/netlink/route/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "route",
     srcs = ["protocol.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/socket/netlink/uevent/BUILD b/pkg/sentry/socket/netlink/uevent/BUILD
index 0777f3baf..73fbdf1eb 100644
--- a/pkg/sentry/socket/netlink/uevent/BUILD
+++ b/pkg/sentry/socket/netlink/uevent/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "uevent",
     srcs = ["protocol.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index f78784569..e3d1f90cb 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -11,7 +11,6 @@ go_library(
         "save_restore.go",
         "stack.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netstack",
     visibility = [
         "//pkg/sentry:internal",
     ],
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index 5b6a154f6..bade18686 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,6 @@ go_library(
         "io.go",
         "unix.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/unix",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index d7ba95dff..4bdfc9208 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -25,7 +25,6 @@ go_library(
         "transport_message_list.go",
         "unix.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD
index 88765f4d6..0ea4aab8b 100644
--- a/pkg/sentry/state/BUILD
+++ b/pkg/sentry/state/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,6 @@ go_library(
         "state_metadata.go",
         "state_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/state",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index aa1ac720c..ff6fafa63 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -1,6 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@rules_cc//cc:defs.bzl", "cc_proto_library")
+load("//tools:defs.bzl", "go_library", "proto_library")
 
 package(licenses = ["notice"])
 
@@ -21,7 +19,6 @@ go_library(
         "strace.go",
         "syscalls.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/strace",
     visibility = ["//:sandbox"],
     deps = [
         ":strace_go_proto",
@@ -42,20 +39,7 @@ go_library(
 )
 
 proto_library(
-    name = "strace_proto",
+    name = "strace",
     srcs = ["strace.proto"],
     visibility = ["//visibility:public"],
 )
-
-cc_proto_library(
-    name = "strace_cc_proto",
-    visibility = ["//visibility:public"],
-    deps = [":strace_proto"],
-)
-
-go_proto_library(
-    name = "strace_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto",
-    proto = ":strace_proto",
-    visibility = ["//visibility:public"],
-)
diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD
index 79d972202..b8d1bd415 100644
--- a/pkg/sentry/syscalls/BUILD
+++ b/pkg/sentry/syscalls/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "epoll.go",
         "syscalls.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 917f74e07..7d74e0f70 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -57,7 +57,6 @@ go_library(
         "sys_xattr.go",
         "timespec.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls/linux",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi",
diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD
index 3cde3a0be..04f81a35b 100644
--- a/pkg/sentry/time/BUILD
+++ b/pkg/sentry/time/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -31,7 +30,6 @@ go_library(
         "tsc_amd64.s",
         "tsc_arm64.s",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/time",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
@@ -48,5 +46,5 @@ go_test(
         "parameters_test.go",
         "sampler_test.go",
     ],
-    embed = [":time"],
+    library = ":time",
 )
diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD
index fc7614fff..370fa6ec5 100644
--- a/pkg/sentry/unimpl/BUILD
+++ b/pkg/sentry/unimpl/BUILD
@@ -1,34 +1,17 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@rules_cc//cc:defs.bzl", "cc_proto_library")
+load("//tools:defs.bzl", "go_library", "proto_library")
 
 package(licenses = ["notice"])
 
 proto_library(
-    name = "unimplemented_syscall_proto",
+    name = "unimplemented_syscall",
     srcs = ["unimplemented_syscall.proto"],
     visibility = ["//visibility:public"],
     deps = ["//pkg/sentry/arch:registers_proto"],
 )
 
-cc_proto_library(
-    name = "unimplemented_syscall_cc_proto",
-    visibility = ["//visibility:public"],
-    deps = [":unimplemented_syscall_proto"],
-)
-
-go_proto_library(
-    name = "unimplemented_syscall_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto",
-    proto = ":unimplemented_syscall_proto",
-    visibility = ["//visibility:public"],
-    deps = ["//pkg/sentry/arch:registers_go_proto"],
-)
-
 go_library(
     name = "unimpl",
     srcs = ["events.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/unimpl",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD
index 86a87edd4..e9c18f170 100644
--- a/pkg/sentry/uniqueid/BUILD
+++ b/pkg/sentry/uniqueid/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "uniqueid",
     srcs = ["context.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/uniqueid",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/sentry/context",
diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD
index 5518ac3d0..099315613 100644
--- a/pkg/sentry/usage/BUILD
+++ b/pkg/sentry/usage/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -11,9 +11,8 @@ go_library(
         "memory_unsafe.go",
         "usage.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/usage",
     visibility = [
-        "//pkg/sentry:internal",
+        "//:sandbox",
     ],
     deps = [
         "//pkg/bits",
diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD
index 684f59a6b..c8322e29e 100644
--- a/pkg/sentry/usermem/BUILD
+++ b/pkg/sentry/usermem/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -29,7 +28,6 @@ go_library(
         "usermem_unsafe.go",
         "usermem_x86.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/usermem",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/atomicbitops",
@@ -38,7 +36,6 @@ go_library(
         "//pkg/sentry/context",
         "//pkg/sentry/safemem",
         "//pkg/syserror",
-        "//pkg/tcpip/buffer",
     ],
 )
 
@@ -49,7 +46,7 @@ go_test(
         "addr_range_seq_test.go",
         "usermem_test.go",
     ],
-    embed = [":usermem"],
+    library = ":usermem",
     deps = [
         "//pkg/sentry/context",
         "//pkg/sentry/safemem",
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 35c7be259..51acdc4e9 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -1,7 +1,6 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "vfs",
@@ -24,7 +23,6 @@ go_library(
         "testutil.go",
         "vfs.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/vfs",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -47,7 +45,7 @@ go_test(
         "file_description_impl_util_test.go",
         "mount_test.go",
     ],
-    embed = [":vfs"],
+    library = ":vfs",
     deps = [
         "//pkg/abi/linux",
         "//pkg/sentry/context",
diff --git a/pkg/sentry/watchdog/BUILD b/pkg/sentry/watchdog/BUILD
index 28f21f13d..1c5a1c9b6 100644
--- a/pkg/sentry/watchdog/BUILD
+++ b/pkg/sentry/watchdog/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "watchdog",
     srcs = ["watchdog.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/watchdog",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/sleep/BUILD b/pkg/sleep/BUILD
index a23c86fb1..e131455f7 100644
--- a/pkg/sleep/BUILD
+++ b/pkg/sleep/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,7 +11,6 @@ go_library(
         "commit_noasm.go",
         "sleep_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sleep",
     visibility = ["//:sandbox"],
 )
 
@@ -22,5 +20,5 @@ go_test(
     srcs = [
         "sleep_test.go",
     ],
-    embed = [":sleep"],
+    library = ":sleep",
 )
diff --git a/pkg/state/BUILD b/pkg/state/BUILD
index be93750bf..921af9d63 100644
--- a/pkg/state/BUILD
+++ b/pkg/state/BUILD
@@ -1,6 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test", "proto_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -49,7 +47,7 @@ go_library(
         "state.go",
         "stats.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/state",
+    stateify = False,
     visibility = ["//:sandbox"],
     deps = [
         ":object_go_proto",
@@ -58,21 +56,14 @@ go_library(
 )
 
 proto_library(
-    name = "object_proto",
+    name = "object",
     srcs = ["object.proto"],
     visibility = ["//:sandbox"],
 )
 
-go_proto_library(
-    name = "object_go_proto",
-    importpath = "gvisor.dev/gvisor/pkg/state/object_go_proto",
-    proto = ":object_proto",
-    visibility = ["//:sandbox"],
-)
-
 go_test(
     name = "state_test",
     timeout = "long",
     srcs = ["state_test.go"],
-    embed = [":state"],
+    library = ":state",
 )
diff --git a/pkg/state/statefile/BUILD b/pkg/state/statefile/BUILD
index 8a865d229..e7581c09b 100644
--- a/pkg/state/statefile/BUILD
+++ b/pkg/state/statefile/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "statefile",
     srcs = ["statefile.go"],
-    importpath = "gvisor.dev/gvisor/pkg/state/statefile",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/binary",
@@ -18,6 +16,6 @@ go_test(
     name = "statefile_test",
     size = "small",
     srcs = ["statefile_test.go"],
-    embed = [":statefile"],
+    library = ":statefile",
     deps = ["//pkg/compressio"],
 )
diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD
index 97c4b3b1e..5340cf0d6 100644
--- a/pkg/sync/BUILD
+++ b/pkg/sync/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template")
 
 package(
@@ -40,7 +39,6 @@ go_library(
         "syncutil.go",
         "tmutex_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sync",
 )
 
 go_test(
@@ -51,5 +49,5 @@ go_test(
         "seqcount_test.go",
         "tmutex_test.go",
     ],
-    embed = [":sync"],
+    library = ":sync",
 )
diff --git a/pkg/sync/atomicptrtest/BUILD b/pkg/sync/atomicptrtest/BUILD
index 418eda29c..e97553254 100644
--- a/pkg/sync/atomicptrtest/BUILD
+++ b/pkg/sync/atomicptrtest/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -18,12 +17,11 @@ go_template_instance(
 go_library(
     name = "atomicptr",
     srcs = ["atomicptr_int_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sync/atomicptr",
 )
 
 go_test(
     name = "atomicptr_test",
     size = "small",
     srcs = ["atomicptr_test.go"],
-    embed = [":atomicptr"],
+    library = ":atomicptr",
 )
diff --git a/pkg/sync/seqatomictest/BUILD b/pkg/sync/seqatomictest/BUILD
index eba21518d..5c38c783e 100644
--- a/pkg/sync/seqatomictest/BUILD
+++ b/pkg/sync/seqatomictest/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -18,7 +17,6 @@ go_template_instance(
 go_library(
     name = "seqatomic",
     srcs = ["seqatomic_int_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/pkg/sync/seqatomic",
     deps = [
         "//pkg/sync",
     ],
@@ -28,6 +26,6 @@ go_test(
     name = "seqatomic_test",
     size = "small",
     srcs = ["seqatomic_test.go"],
-    embed = [":seqatomic"],
+    library = ":seqatomic",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/syserr/BUILD b/pkg/syserr/BUILD
index 5665ad4ee..7d760344a 100644
--- a/pkg/syserr/BUILD
+++ b/pkg/syserr/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,6 @@ go_library(
         "netstack.go",
         "syserr.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/syserr",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/abi/linux",
diff --git a/pkg/syserror/BUILD b/pkg/syserror/BUILD
index bd3f9fd28..b13c15d9b 100644
--- a/pkg/syserror/BUILD
+++ b/pkg/syserror/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "syserror",
     srcs = ["syserror.go"],
-    importpath = "gvisor.dev/gvisor/pkg/syserror",
     visibility = ["//visibility:public"],
 )
 
diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
index 23e4b09e7..26f7ba86b 100644
--- a/pkg/tcpip/BUILD
+++ b/pkg/tcpip/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,7 +11,6 @@ go_library(
         "time_unsafe.go",
         "timer.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/sync",
@@ -25,7 +23,7 @@ go_test(
     name = "tcpip_test",
     size = "small",
     srcs = ["tcpip_test.go"],
-    embed = [":tcpip"],
+    library = ":tcpip",
 )
 
 go_test(
diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD
index 3df7d18d3..a984f1712 100644
--- a/pkg/tcpip/adapters/gonet/BUILD
+++ b/pkg/tcpip/adapters/gonet/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "gonet",
     srcs = ["gonet.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/sync",
@@ -23,7 +21,7 @@ go_test(
     name = "gonet_test",
     size = "small",
     srcs = ["gonet_test.go"],
-    embed = [":gonet"],
+    library = ":gonet",
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/header",
diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD
index d6c31bfa2..563bc78ea 100644
--- a/pkg/tcpip/buffer/BUILD
+++ b/pkg/tcpip/buffer/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "prependable.go",
         "view.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/buffer",
     visibility = ["//visibility:public"],
 )
 
@@ -17,5 +15,5 @@ go_test(
     name = "buffer_test",
     size = "small",
     srcs = ["view_test.go"],
-    embed = [":buffer"],
+    library = ":buffer",
 )
diff --git a/pkg/tcpip/checker/BUILD b/pkg/tcpip/checker/BUILD
index b6fa6fc37..ed434807f 100644
--- a/pkg/tcpip/checker/BUILD
+++ b/pkg/tcpip/checker/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "checker",
     testonly = 1,
     srcs = ["checker.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/checker",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
diff --git a/pkg/tcpip/hash/jenkins/BUILD b/pkg/tcpip/hash/jenkins/BUILD
index e648efa71..ff2719291 100644
--- a/pkg/tcpip/hash/jenkins/BUILD
+++ b/pkg/tcpip/hash/jenkins/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "jenkins",
     srcs = ["jenkins.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins",
     visibility = ["//visibility:public"],
 )
 
@@ -16,5 +14,5 @@ go_test(
     srcs = [
         "jenkins_test.go",
     ],
-    embed = [":jenkins"],
+    library = ":jenkins",
 )
diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index cd747d100..9da0d71f8 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -24,7 +23,6 @@ go_library(
         "tcp.go",
         "udp.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/header",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
@@ -59,7 +57,7 @@ go_test(
         "eth_test.go",
         "ndp_test.go",
     ],
-    embed = [":header"],
+    library = ":header",
     deps = [
         "//pkg/tcpip",
         "@com_github_google_go-cmp//cmp:go_default_library",
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index 297eaccaf..d1b73cfdf 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,6 @@ go_library(
         "targets.go",
         "types.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/iptables",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD
index 7dbc05754..3974c464e 100644
--- a/pkg/tcpip/link/channel/BUILD
+++ b/pkg/tcpip/link/channel/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "channel",
     srcs = ["channel.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/channel",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD
index 66cc53ed4..abe725548 100644
--- a/pkg/tcpip/link/fdbased/BUILD
+++ b/pkg/tcpip/link/fdbased/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -13,7 +12,6 @@ go_library(
         "mmap_unsafe.go",
         "packet_dispatchers.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/fdbased",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/sync",
@@ -30,7 +28,7 @@ go_test(
     name = "fdbased_test",
     size = "small",
     srcs = ["endpoint_test.go"],
-    embed = [":fdbased"],
+    library = ":fdbased",
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD
index f35fcdff4..6bf3805b7 100644
--- a/pkg/tcpip/link/loopback/BUILD
+++ b/pkg/tcpip/link/loopback/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "loopback",
     srcs = ["loopback.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/loopback",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
diff --git a/pkg/tcpip/link/muxed/BUILD b/pkg/tcpip/link/muxed/BUILD
index 1ac7948b6..82b441b79 100644
--- a/pkg/tcpip/link/muxed/BUILD
+++ b/pkg/tcpip/link/muxed/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "muxed",
     srcs = ["injectable.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/muxed",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
@@ -19,7 +17,7 @@ go_test(
     name = "muxed_test",
     size = "small",
     srcs = ["injectable_test.go"],
-    embed = [":muxed"],
+    library = ":muxed",
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD
index d8211e93d..14b527bc2 100644
--- a/pkg/tcpip/link/rawfile/BUILD
+++ b/pkg/tcpip/link/rawfile/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -12,7 +12,6 @@ go_library(
         "errors.go",
         "rawfile_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/rawfile",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD
index 09165dd4c..13243ebbb 100644
--- a/pkg/tcpip/link/sharedmem/BUILD
+++ b/pkg/tcpip/link/sharedmem/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -11,7 +10,6 @@ go_library(
         "sharedmem_unsafe.go",
         "tx.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
@@ -30,7 +28,7 @@ go_test(
     srcs = [
         "sharedmem_test.go",
     ],
-    embed = [":sharedmem"],
+    library = ":sharedmem",
     deps = [
         "//pkg/sync",
         "//pkg/tcpip",
diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD
index a0d4ad0be..87020ec08 100644
--- a/pkg/tcpip/link/sharedmem/pipe/BUILD
+++ b/pkg/tcpip/link/sharedmem/pipe/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -11,7 +10,6 @@ go_library(
         "rx.go",
         "tx.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe",
     visibility = ["//visibility:public"],
 )
 
@@ -20,6 +18,6 @@ go_test(
     srcs = [
         "pipe_test.go",
     ],
-    embed = [":pipe"],
+    library = ":pipe",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/tcpip/link/sharedmem/queue/BUILD b/pkg/tcpip/link/sharedmem/queue/BUILD
index 8c9234d54..3ba06af73 100644
--- a/pkg/tcpip/link/sharedmem/queue/BUILD
+++ b/pkg/tcpip/link/sharedmem/queue/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "rx.go",
         "tx.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
@@ -22,7 +20,7 @@ go_test(
     srcs = [
         "queue_test.go",
     ],
-    embed = [":queue"],
+    library = ":queue",
     deps = [
         "//pkg/tcpip/link/sharedmem/pipe",
     ],
diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD
index d6ae0368a..230a8d53a 100644
--- a/pkg/tcpip/link/sniffer/BUILD
+++ b/pkg/tcpip/link/sniffer/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,7 +8,6 @@ go_library(
         "pcap.go",
         "sniffer.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sniffer",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD
index a71a493fc..e5096ea38 100644
--- a/pkg/tcpip/link/tun/BUILD
+++ b/pkg/tcpip/link/tun/BUILD
@@ -1,10 +1,9 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "tun",
     srcs = ["tun_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/tun",
     visibility = ["//visibility:public"],
 )
diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD
index 134837943..0956d2c65 100644
--- a/pkg/tcpip/link/waitable/BUILD
+++ b/pkg/tcpip/link/waitable/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -8,7 +7,6 @@ go_library(
     srcs = [
         "waitable.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/link/waitable",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/gate",
@@ -23,7 +21,7 @@ go_test(
     srcs = [
         "waitable_test.go",
     ],
-    embed = [":waitable"],
+    library = ":waitable",
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD
index 9d16ff8c9..6a4839fb8 100644
--- a/pkg/tcpip/network/BUILD
+++ b/pkg/tcpip/network/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_test")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD
index e7617229b..eddf7b725 100644
--- a/pkg/tcpip/network/arp/BUILD
+++ b/pkg/tcpip/network/arp/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "arp",
     srcs = ["arp.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/network/arp",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD
index ed16076fd..d1c728ccf 100644
--- a/pkg/tcpip/network/fragmentation/BUILD
+++ b/pkg/tcpip/network/fragmentation/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -24,7 +23,6 @@ go_library(
         "reassembler.go",
         "reassembler_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/network/fragmentation",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/log",
@@ -42,6 +40,6 @@ go_test(
         "fragmentation_test.go",
         "reassembler_test.go",
     ],
-    embed = [":fragmentation"],
+    library = ":fragmentation",
     deps = ["//pkg/tcpip/buffer"],
 )
diff --git a/pkg/tcpip/network/hash/BUILD b/pkg/tcpip/network/hash/BUILD
index e6db5c0b0..872165866 100644
--- a/pkg/tcpip/network/hash/BUILD
+++ b/pkg/tcpip/network/hash/BUILD
@@ -1,11 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "hash",
     srcs = ["hash.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/network/hash",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/rand",
diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD
index 4e2aae9a3..0fef2b1f1 100644
--- a/pkg/tcpip/network/ipv4/BUILD
+++ b/pkg/tcpip/network/ipv4/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "icmp.go",
         "ipv4.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/network/ipv4",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD
index e4e273460..fb11874c6 100644
--- a/pkg/tcpip/network/ipv6/BUILD
+++ b/pkg/tcpip/network/ipv6/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "icmp.go",
         "ipv6.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/network/ipv6",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip",
@@ -27,7 +25,7 @@ go_test(
         "ipv6_test.go",
         "ndp_test.go",
     ],
-    embed = [":ipv6"],
+    library = ":ipv6",
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD
index a6ef3bdcc..2bad05a2e 100644
--- a/pkg/tcpip/ports/BUILD
+++ b/pkg/tcpip/ports/BUILD
@@ -1,12 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "ports",
     srcs = ["ports.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/ports",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/sync",
@@ -17,7 +15,7 @@ go_library(
 go_test(
     name = "ports_test",
     srcs = ["ports_test.go"],
-    embed = [":ports"],
+    library = ":ports",
     deps = [
         "//pkg/tcpip",
     ],
diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD
index d7496fde6..cf0a5fefe 100644
--- a/pkg/tcpip/sample/tun_tcp_connect/BUILD
+++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/sample/tun_tcp_echo/BUILD b/pkg/tcpip/sample/tun_tcp_echo/BUILD
index 875561566..43264b76d 100644
--- a/pkg/tcpip/sample/tun_tcp_echo/BUILD
+++ b/pkg/tcpip/sample/tun_tcp_echo/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/pkg/tcpip/seqnum/BUILD b/pkg/tcpip/seqnum/BUILD
index b31ddba2f..45f503845 100644
--- a/pkg/tcpip/seqnum/BUILD
+++ b/pkg/tcpip/seqnum/BUILD
@@ -1,10 +1,9 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "seqnum",
     srcs = ["seqnum.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/seqnum",
     visibility = ["//visibility:public"],
 )
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 783351a69..f5b750046 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -30,7 +29,6 @@ go_library(
         "stack_global_state.go",
         "transport_demuxer.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/stack",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/ilist",
@@ -81,7 +79,7 @@ go_test(
     name = "stack_test",
     size = "small",
     srcs = ["linkaddrcache_test.go"],
-    embed = [":stack"],
+    library = ":stack",
     deps = [
         "//pkg/sleep",
         "//pkg/sync",
diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD
index 3aa23d529..ac18ec5b1 100644
--- a/pkg/tcpip/transport/icmp/BUILD
+++ b/pkg/tcpip/transport/icmp/BUILD
@@ -1,5 +1,5 @@
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -23,7 +23,6 @@ go_library(
         "icmp_packet_list.go",
         "protocol.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/icmp",
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD
index 4858d150c..d22de6b26 100644
--- a/pkg/tcpip/transport/packet/BUILD
+++ b/pkg/tcpip/transport/packet/BUILD
@@ -1,5 +1,5 @@
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -22,7 +22,6 @@ go_library(
         "endpoint_state.go",
         "packet_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/packet",
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD
index 2f2131ff7..c9baf4600 100644
--- a/pkg/tcpip/transport/raw/BUILD
+++ b/pkg/tcpip/transport/raw/BUILD
@@ -1,5 +1,5 @@
+load("//tools:defs.bzl", "go_library")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -23,7 +23,6 @@ go_library(
         "protocol.go",
         "raw_packet_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/raw",
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 0e3ab05ad..4acd9fb9a 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -55,7 +54,6 @@ go_library(
         "tcp_segment_list.go",
         "timer.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/tcp",
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/tcpip/transport/tcp/testing/context/BUILD b/pkg/tcpip/transport/tcp/testing/context/BUILD
index b33ec2087..ce6a2c31d 100644
--- a/pkg/tcpip/transport/tcp/testing/context/BUILD
+++ b/pkg/tcpip/transport/tcp/testing/context/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "context",
     testonly = 1,
     srcs = ["context.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context",
     visibility = [
         "//visibility:public",
     ],
diff --git a/pkg/tcpip/transport/tcpconntrack/BUILD b/pkg/tcpip/transport/tcpconntrack/BUILD
index 43fcc27f0..3ad6994a7 100644
--- a/pkg/tcpip/transport/tcpconntrack/BUILD
+++ b/pkg/tcpip/transport/tcpconntrack/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "tcpconntrack",
     srcs = ["tcp_conntrack.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/tcpip/header",
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index 57ff123e3..adc908e24 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -25,7 +24,6 @@ go_library(
         "protocol.go",
         "udp_packet_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/udp",
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD
index 07778e4f7..2dcba84ae 100644
--- a/pkg/tmutex/BUILD
+++ b/pkg/tmutex/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "tmutex",
     srcs = ["tmutex.go"],
-    importpath = "gvisor.dev/gvisor/pkg/tmutex",
     visibility = ["//:sandbox"],
 )
 
@@ -14,6 +12,6 @@ go_test(
     name = "tmutex_test",
     size = "medium",
     srcs = ["tmutex_test.go"],
-    embed = [":tmutex"],
+    library = ":tmutex",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD
index d1885ae66..a86501fa2 100644
--- a/pkg/unet/BUILD
+++ b/pkg/unet/BUILD
@@ -1,5 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -9,7 +8,6 @@ go_library(
         "unet.go",
         "unet_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/unet",
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/gate",
@@ -23,6 +21,6 @@ go_test(
     srcs = [
         "unet_test.go",
     ],
-    embed = [":unet"],
+    library = ":unet",
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/urpc/BUILD b/pkg/urpc/BUILD
index b8fdc3125..850c34ed0 100644
--- a/pkg/urpc/BUILD
+++ b/pkg/urpc/BUILD
@@ -1,12 +1,10 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "urpc",
     srcs = ["urpc.go"],
-    importpath = "gvisor.dev/gvisor/pkg/urpc",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/fd",
@@ -20,6 +18,6 @@ go_test(
     name = "urpc_test",
     size = "small",
     srcs = ["urpc_test.go"],
-    embed = [":urpc"],
+    library = ":urpc",
     deps = ["//pkg/unet"],
 )
diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD
index 1c6890e52..852480a09 100644
--- a/pkg/waiter/BUILD
+++ b/pkg/waiter/BUILD
@@ -1,6 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
-load("//tools/go_stateify:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -22,7 +21,6 @@ go_library(
         "waiter.go",
         "waiter_list.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/waiter",
     visibility = ["//visibility:public"],
     deps = ["//pkg/sync"],
 )
@@ -33,5 +31,5 @@ go_test(
     srcs = [
         "waiter_test.go",
     ],
-    embed = [":waiter"],
+    library = ":waiter",
 )
diff --git a/runsc/BUILD b/runsc/BUILD
index e5587421d..b35b41d81 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -1,7 +1,6 @@
-package(licenses = ["notice"])  # Apache 2.0
+load("//tools:defs.bzl", "go_binary", "pkg_deb", "pkg_tar")
 
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
-load("@rules_pkg//:pkg.bzl", "pkg_deb", "pkg_tar")
+package(licenses = ["notice"])
 
 go_binary(
     name = "runsc",
@@ -9,7 +8,7 @@ go_binary(
         "main.go",
         "version.go",
     ],
-    pure = "on",
+    pure = True,
     visibility = [
         "//visibility:public",
     ],
@@ -26,10 +25,12 @@ go_binary(
 )
 
 # The runsc-race target is a race-compatible BUILD target. This must be built
-# via "bazel build --features=race //runsc:runsc-race", since the race feature
-# must apply to all dependencies due a bug in gazelle file selection.  The pure
-# attribute must be off because the race detector requires linking with non-Go
-# components, although we still require a static binary.
+# via: bazel build --features=race //runsc:runsc-race
+#
+# This is necessary because the race feature must apply to all dependencies
+# due to a bug in gazelle file selection.  The pure attribute must be off
+# because the race detector requires linking with non-Go components, although
+# we still require a static binary.
 #
 # Note that in the future this might be convertible to a compatible target by
 # using the pure and static attributes within a select function, but select is
@@ -42,7 +43,7 @@ go_binary(
         "main.go",
         "version.go",
     ],
-    static = "on",
+    static = True,
     visibility = [
         "//visibility:public",
     ],
@@ -82,7 +83,12 @@ genrule(
     # because they are assumes to be hermetic).
     srcs = [":runsc"],
     outs = ["version.txt"],
-    cmd = "$(location :runsc) -version | grep 'runsc version' | sed 's/^[^0-9]*//' > $@",
+    # Note that the little dance here is necessary because files in the $(SRCS)
+    # attribute are not executable by default, and we can't touch in place.
+    cmd = "cp $(location :runsc) $(@D)/runsc && \
+        chmod a+x $(@D)/runsc && \
+        $(@D)/runsc -version | grep version | sed 's/^[^0-9]*//' > $@ && \
+        rm -f $(@D)/runsc",
     stamp = 1,
 )
 
@@ -109,5 +115,6 @@ sh_test(
     name = "version_test",
     size = "small",
     srcs = ["version_test.sh"],
+    args = ["$(location :runsc)"],
     data = [":runsc"],
 )
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 3e20f8f2f..f3ebc0231 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -23,7 +23,6 @@ go_library(
         "strace.go",
         "user.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/boot",
     visibility = [
         "//runsc:__subpackages__",
         "//test:__subpackages__",
@@ -107,7 +106,7 @@ go_test(
         "loader_test.go",
         "user_test.go",
     ],
-    embed = [":boot"],
+    library = ":boot",
     deps = [
         "//pkg/control/server",
         "//pkg/log",
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
index 3a9dcfc04..ce30f6c53 100644
--- a/runsc/boot/filter/BUILD
+++ b/runsc/boot/filter/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -13,7 +13,6 @@ go_library(
         "extra_filters_race.go",
         "filter.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/boot/filter",
     visibility = [
         "//runsc/boot:__subpackages__",
     ],
diff --git a/runsc/boot/platforms/BUILD b/runsc/boot/platforms/BUILD
index 03391cdca..77774f43c 100644
--- a/runsc/boot/platforms/BUILD
+++ b/runsc/boot/platforms/BUILD
@@ -1,11 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "platforms",
     srcs = ["platforms.go"],
-    importpath = "gvisor.dev/gvisor/runsc/boot/platforms",
     visibility = [
         "//runsc:__subpackages__",
     ],
diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD
index d6165f9e5..d4c7bdfbb 100644
--- a/runsc/cgroup/BUILD
+++ b/runsc/cgroup/BUILD
@@ -1,11 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "cgroup",
     srcs = ["cgroup.go"],
-    importpath = "gvisor.dev/gvisor/runsc/cgroup",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
@@ -19,6 +18,6 @@ go_test(
     name = "cgroup_test",
     size = "small",
     srcs = ["cgroup_test.go"],
-    embed = [":cgroup"],
+    library = ":cgroup",
     tags = ["local"],
 )
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index b94bc4fa0..09aa46434 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -34,7 +34,6 @@ go_library(
         "syscalls.go",
         "wait.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/cmd",
     visibility = [
         "//runsc:__subpackages__",
     ],
@@ -73,7 +72,7 @@ go_test(
     data = [
         "//runsc",
     ],
-    embed = [":cmd"],
+    library = ":cmd",
     deps = [
         "//pkg/abi/linux",
         "//pkg/log",
diff --git a/runsc/console/BUILD b/runsc/console/BUILD
index e623c1a0f..06924bccd 100644
--- a/runsc/console/BUILD
+++ b/runsc/console/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -7,7 +7,6 @@ go_library(
     srcs = [
         "console.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/console",
     visibility = [
         "//runsc:__subpackages__",
     ],
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 6dea179e4..e21431e4c 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +10,6 @@ go_library(
         "state_file.go",
         "status.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/container",
     visibility = [
         "//runsc:__subpackages__",
         "//test:__subpackages__",
@@ -42,7 +41,7 @@ go_test(
         "//runsc",
         "//runsc/container/test_app",
     ],
-    embed = [":container"],
+    library = ":container",
     shard_count = 5,
     tags = [
         "requires-kvm",
diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD
index bfd338bb6..e200bafd9 100644
--- a/runsc/container/test_app/BUILD
+++ b/runsc/container/test_app/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,7 @@ go_binary(
         "fds.go",
         "test_app.go",
     ],
-    pure = "on",
+    pure = True,
     visibility = ["//runsc/container:__pkg__"],
     deps = [
         "//pkg/unet",
diff --git a/runsc/criutil/BUILD b/runsc/criutil/BUILD
index 558133a0e..8a571a000 100644
--- a/runsc/criutil/BUILD
+++ b/runsc/criutil/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "criutil",
     testonly = 1,
     srcs = ["criutil.go"],
-    importpath = "gvisor.dev/gvisor/runsc/criutil",
     visibility = ["//:sandbox"],
     deps = ["//runsc/testutil"],
 )
diff --git a/runsc/dockerutil/BUILD b/runsc/dockerutil/BUILD
index 0e0423504..8621af901 100644
--- a/runsc/dockerutil/BUILD
+++ b/runsc/dockerutil/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "dockerutil",
     testonly = 1,
     srcs = ["dockerutil.go"],
-    importpath = "gvisor.dev/gvisor/runsc/dockerutil",
     visibility = ["//:sandbox"],
     deps = [
         "//runsc/testutil",
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index a9582d92b..64a406ae2 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,10 +10,7 @@ go_library(
         "fsgofer_arm64_unsafe.go",
         "fsgofer_unsafe.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/fsgofer",
-    visibility = [
-        "//runsc:__subpackages__",
-    ],
+    visibility = ["//runsc:__subpackages__"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/fd",
@@ -30,7 +27,7 @@ go_test(
     name = "fsgofer_test",
     size = "small",
     srcs = ["fsgofer_test.go"],
-    embed = [":fsgofer"],
+    library = ":fsgofer",
     deps = [
         "//pkg/log",
         "//pkg/p9",
diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD
index bac73f89d..82b48ef32 100644
--- a/runsc/fsgofer/filter/BUILD
+++ b/runsc/fsgofer/filter/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -13,7 +13,6 @@ go_library(
         "extra_filters_race.go",
         "filter.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/fsgofer/filter",
     visibility = [
         "//runsc:__subpackages__",
     ],
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index ddbc37456..c95d50294 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -9,7 +9,6 @@ go_library(
         "network_unsafe.go",
         "sandbox.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/sandbox",
     visibility = [
         "//runsc:__subpackages__",
     ],
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
index 205638803..4ccd77f63 100644
--- a/runsc/specutils/BUILD
+++ b/runsc/specutils/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +10,6 @@ go_library(
         "namespace.go",
         "specutils.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/specutils",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
@@ -28,6 +27,6 @@ go_test(
     name = "specutils_test",
     size = "small",
     srcs = ["specutils_test.go"],
-    embed = [":specutils"],
+    library = ":specutils",
     deps = ["@com_github_opencontainers_runtime-spec//specs-go:go_default_library"],
 )
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
index 3c3027cb5..f845120b0 100644
--- a/runsc/testutil/BUILD
+++ b/runsc/testutil/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,7 +6,6 @@ go_library(
     name = "testutil",
     testonly = 1,
     srcs = ["testutil.go"],
-    importpath = "gvisor.dev/gvisor/runsc/testutil",
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
diff --git a/runsc/version_test.sh b/runsc/version_test.sh
index cc0ca3f05..747350654 100755
--- a/runsc/version_test.sh
+++ b/runsc/version_test.sh
@@ -16,7 +16,7 @@
 
 set -euf -x -o pipefail
 
-readonly runsc="${TEST_SRCDIR}/__main__/runsc/linux_amd64_pure_stripped/runsc"
+readonly runsc="$1"
 readonly version=$($runsc --version)
 
 # Version should should not match VERSION, which is the default and which will
diff --git a/scripts/common.sh b/scripts/common.sh
index fdb1aa142..cd91b9f8e 100755
--- a/scripts/common.sh
+++ b/scripts/common.sh
@@ -16,11 +16,7 @@
 
 set -xeou pipefail
 
-if [[ -f $(dirname $0)/common_google.sh ]]; then
-  source $(dirname $0)/common_google.sh
-else
-  source $(dirname $0)/common_bazel.sh
-fi
+source $(dirname $0)/common_build.sh
 
 # Ensure it attempts to collect logs in all cases.
 trap collect_logs EXIT
diff --git a/scripts/common_bazel.sh b/scripts/common_bazel.sh
deleted file mode 100755
index a473a88a4..000000000
--- a/scripts/common_bazel.sh
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Install the latest version of Bazel and log the version.
-(which use_bazel.sh && use_bazel.sh latest) || which bazel
-bazel version
-
-# Switch into the workspace; only necessary if run with kokoro.
-if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then
-  cd git/repo
-elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then
-  cd github/repo
-fi
-
-# Set the standard bazel flags.
-declare -r BAZEL_FLAGS=(
-  "--show_timestamps"
-  "--test_output=errors"
-  "--keep_going"
-  "--verbose_failures=true"
-)
-if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then
-  declare -r BAZEL_RBE_AUTH_FLAGS=(
-    "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
-  )
-  declare -r BAZEL_RBE_FLAGS=("--config=remote")
-fi
-
-# Wrap bazel.
-function build() {
-  bazel build "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" 2>&1 |
-    tee /dev/fd/2 | grep -E '^  bazel-bin/' | awk '{ print $1; }'
-}
-
-function test() {
-  bazel test "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@"
-}
-
-function run() {
-  local binary=$1
-  shift
-  bazel run "${binary}" -- "$@"
-}
-
-function run_as_root() {
-  local binary=$1
-  shift
-  bazel run --run_under="sudo" "${binary}" -- "$@"
-}
-
-function collect_logs() {
-  # Zip out everything into a convenient form.
-  if [[ -v KOKORO_ARTIFACTS_DIR ]] && [[ -e bazel-testlogs ]]; then
-    # Merge results files of all shards for each test suite.
-    for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do
-      junitparser merge `find $d -name test.xml` $d/test.xml
-      cat $d/shard_*_of_*/test.log > $d/test.log
-      ls -l $d/shard_*_of_*/test.outputs/outputs.zip && zip -r -1 $d/outputs.zip $d/shard_*_of_*/test.outputs/outputs.zip
-    done
-    find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf
-    # Move test logs to Kokoro directory. tar is used to conveniently perform
-    # renames while moving files.
-    find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" |
-      tar --create --files-from - --transform 's/test\./sponge_log./' |
-      tar --extract --directory ${KOKORO_ARTIFACTS_DIR}
-
-    # Collect sentry logs, if any.
-    if [[ -v RUNSC_LOGS_DIR ]] && [[ -d "${RUNSC_LOGS_DIR}" ]]; then
-      # Check if the directory is empty or not (only the first line it needed).
-      local -r logs=$(ls "${RUNSC_LOGS_DIR}" | head -n1)
-      if [[ "${logs}" ]]; then
-        local -r archive=runsc_logs_"${RUNTIME}".tar.gz
-        if [[ -v KOKORO_BUILD_ARTIFACTS_SUBDIR ]]; then
-          echo "runsc logs will be uploaded to:"
-          echo "    gsutil cp gs://gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive} /tmp"
-          echo "    https://storage.cloud.google.com/gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive}"
-        fi
-        tar --create --gzip --file="${KOKORO_ARTIFACTS_DIR}/${archive}" -C "${RUNSC_LOGS_DIR}" .
-      fi
-    fi
-  fi
-}
-
-function find_branch_name() {
-  git branch --show-current || git rev-parse HEAD || bazel info workspace | xargs basename
-}
diff --git a/scripts/common_build.sh b/scripts/common_build.sh
new file mode 100755
index 000000000..a473a88a4
--- /dev/null
+++ b/scripts/common_build.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Install the latest version of Bazel and log the version.
+(which use_bazel.sh && use_bazel.sh latest) || which bazel
+bazel version
+
+# Switch into the workspace; only necessary if run with kokoro.
+if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then
+  cd git/repo
+elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then
+  cd github/repo
+fi
+
+# Set the standard bazel flags.
+declare -r BAZEL_FLAGS=(
+  "--show_timestamps"
+  "--test_output=errors"
+  "--keep_going"
+  "--verbose_failures=true"
+)
+if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then
+  declare -r BAZEL_RBE_AUTH_FLAGS=(
+    "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
+  )
+  declare -r BAZEL_RBE_FLAGS=("--config=remote")
+fi
+
+# Wrap bazel.
+function build() {
+  bazel build "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" 2>&1 |
+    tee /dev/fd/2 | grep -E '^  bazel-bin/' | awk '{ print $1; }'
+}
+
+function test() {
+  bazel test "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@"
+}
+
+function run() {
+  local binary=$1
+  shift
+  bazel run "${binary}" -- "$@"
+}
+
+function run_as_root() {
+  local binary=$1
+  shift
+  bazel run --run_under="sudo" "${binary}" -- "$@"
+}
+
+function collect_logs() {
+  # Zip out everything into a convenient form.
+  if [[ -v KOKORO_ARTIFACTS_DIR ]] && [[ -e bazel-testlogs ]]; then
+    # Merge results files of all shards for each test suite.
+    for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do
+      junitparser merge `find $d -name test.xml` $d/test.xml
+      cat $d/shard_*_of_*/test.log > $d/test.log
+      ls -l $d/shard_*_of_*/test.outputs/outputs.zip && zip -r -1 $d/outputs.zip $d/shard_*_of_*/test.outputs/outputs.zip
+    done
+    find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf
+    # Move test logs to Kokoro directory. tar is used to conveniently perform
+    # renames while moving files.
+    find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" |
+      tar --create --files-from - --transform 's/test\./sponge_log./' |
+      tar --extract --directory ${KOKORO_ARTIFACTS_DIR}
+
+    # Collect sentry logs, if any.
+    if [[ -v RUNSC_LOGS_DIR ]] && [[ -d "${RUNSC_LOGS_DIR}" ]]; then
+      # Check if the directory is empty or not (only the first line is needed).
+      local -r logs=$(ls "${RUNSC_LOGS_DIR}" | head -n1)
+      if [[ "${logs}" ]]; then
+        local -r archive=runsc_logs_"${RUNTIME}".tar.gz
+        if [[ -v KOKORO_BUILD_ARTIFACTS_SUBDIR ]]; then
+          echo "runsc logs will be uploaded to:"
+          echo "    gsutil cp gs://gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive} /tmp"
+          echo "    https://storage.cloud.google.com/gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive}"
+        fi
+        tar --create --gzip --file="${KOKORO_ARTIFACTS_DIR}/${archive}" -C "${RUNSC_LOGS_DIR}" .
+      fi
+    fi
+  fi
+}
+
+function find_branch_name() {
+  git branch --show-current || git rev-parse HEAD || bazel info workspace | xargs basename
+}
diff --git a/test/BUILD b/test/BUILD
index bf834d994..34b950644 100644
--- a/test/BUILD
+++ b/test/BUILD
@@ -1,44 +1 @@
-package(licenses = ["notice"])  # Apache 2.0
-
-# We need to define a bazel platform and toolchain to specify dockerPrivileged
-# and dockerRunAsRoot options, they are required to run tests on the RBE
-# cluster in Kokoro.
-alias(
-    name = "rbe_ubuntu1604",
-    actual = ":rbe_ubuntu1604_r346485",
-)
-
-platform(
-    name = "rbe_ubuntu1604_r346485",
-    constraint_values = [
-        "@bazel_tools//platforms:x86_64",
-        "@bazel_tools//platforms:linux",
-        "@bazel_tools//tools/cpp:clang",
-        "@bazel_toolchains//constraints:xenial",
-        "@bazel_toolchains//constraints/sanitizers:support_msan",
-    ],
-    remote_execution_properties = """
-        properties: {
-          name: "container-image"
-          value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50"
-        }
-        properties: {
-          name: "dockerAddCapabilities"
-          value: "SYS_ADMIN"
-        }
-        properties: {
-          name: "dockerPrivileged"
-          value: "true"
-        }
-    """,
-)
-
-toolchain(
-    name = "cc-toolchain-clang-x86_64-default",
-    exec_compatible_with = [
-    ],
-    target_compatible_with = [
-    ],
-    toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8",
-    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
-)
+package(licenses = ["notice"])
diff --git a/test/e2e/BUILD b/test/e2e/BUILD
index 4fe03a220..76e04f878 100644
--- a/test/e2e/BUILD
+++ b/test/e2e/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -10,7 +10,7 @@ go_test(
         "integration_test.go",
         "regression_test.go",
     ],
-    embed = [":integration"],
+    library = ":integration",
     tags = [
         # Requires docker and runsc to be configured before the test runs.
         "manual",
@@ -29,5 +29,4 @@ go_test(
 go_library(
     name = "integration",
     srcs = ["integration.go"],
-    importpath = "gvisor.dev/gvisor/test/integration",
 )
diff --git a/test/image/BUILD b/test/image/BUILD
index 09b0a0ad5..7392ac54e 100644
--- a/test/image/BUILD
+++ b/test/image/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -14,7 +14,7 @@ go_test(
         "ruby.rb",
         "ruby.sh",
     ],
-    embed = [":image"],
+    library = ":image",
     tags = [
         # Requires docker and runsc to be configured before the test runs.
         "manual",
@@ -30,5 +30,4 @@ go_test(
 go_library(
     name = "image",
     srcs = ["image.go"],
-    importpath = "gvisor.dev/gvisor/test/image",
 )
diff --git a/test/iptables/BUILD b/test/iptables/BUILD
index 22f470092..6bb3b82b5 100644
--- a/test/iptables/BUILD
+++ b/test/iptables/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -12,7 +12,6 @@ go_library(
         "iptables_util.go",
         "nat.go",
     ],
-    importpath = "gvisor.dev/gvisor/test/iptables",
     visibility = ["//test/iptables:__subpackages__"],
     deps = [
         "//runsc/testutil",
@@ -24,7 +23,7 @@ go_test(
     srcs = [
         "iptables_test.go",
     ],
-    embed = [":iptables"],
+    library = ":iptables",
     tags = [
         "local",
         "manual",
diff --git a/test/iptables/runner/BUILD b/test/iptables/runner/BUILD
index a5b6f082c..b9199387a 100644
--- a/test/iptables/runner/BUILD
+++ b/test/iptables/runner/BUILD
@@ -1,15 +1,21 @@
-load("@io_bazel_rules_docker//go:image.bzl", "go_image")
-load("@io_bazel_rules_docker//container:container.bzl", "container_image")
+load("//tools:defs.bzl", "container_image", "go_binary", "go_image")
 
 package(licenses = ["notice"])
 
+go_binary(
+    name = "runner",
+    testonly = 1,
+    srcs = ["main.go"],
+    deps = ["//test/iptables"],
+)
+
 container_image(
     name = "iptables-base",
     base = "@iptables-test//image",
 )
 
 go_image(
-    name = "runner",
+    name = "runner-image",
     testonly = 1,
     srcs = ["main.go"],
     base = ":iptables-base",
diff --git a/test/root/BUILD b/test/root/BUILD
index d5dd9bca2..23ce2a70f 100644
--- a/test/root/BUILD
+++ b/test/root/BUILD
@@ -1,11 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "root",
     srcs = ["root.go"],
-    importpath = "gvisor.dev/gvisor/test/root",
 )
 
 go_test(
@@ -21,7 +20,7 @@ go_test(
     data = [
         "//runsc",
     ],
-    embed = [":root"],
+    library = ":root",
     tags = [
         # Requires docker and runsc to be configured before the test runs.
         # Also test only runs as root.
diff --git a/test/root/testdata/BUILD b/test/root/testdata/BUILD
index 125633680..bca5f9cab 100644
--- a/test/root/testdata/BUILD
+++ b/test/root/testdata/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -12,7 +12,6 @@ go_library(
         "sandbox.go",
         "simple.go",
     ],
-    importpath = "gvisor.dev/gvisor/test/root/testdata",
     visibility = [
         "//visibility:public",
     ],
diff --git a/test/runtimes/BUILD b/test/runtimes/BUILD
index 367295206..2c472bf8d 100644
--- a/test/runtimes/BUILD
+++ b/test/runtimes/BUILD
@@ -1,6 +1,6 @@
 # These packages are used to run language runtime tests inside gVisor sandboxes.
 
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_test")
+load("//tools:defs.bzl", "go_binary", "go_test")
 load("//test/runtimes:build_defs.bzl", "runtime_test")
 
 package(licenses = ["notice"])
@@ -49,5 +49,5 @@ go_test(
     name = "blacklist_test",
     size = "small",
     srcs = ["blacklist_test.go"],
-    embed = [":runner"],
+    library = ":runner",
 )
diff --git a/test/runtimes/build_defs.bzl b/test/runtimes/build_defs.bzl
index 6f84ca852..92e275a76 100644
--- a/test/runtimes/build_defs.bzl
+++ b/test/runtimes/build_defs.bzl
@@ -1,6 +1,6 @@
 """Defines a rule for runtime test targets."""
 
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_test", "loopback")
 
 def runtime_test(
         name,
@@ -34,6 +34,7 @@ def runtime_test(
     ]
     data = [
         ":runner",
+        loopback,
     ]
     if blacklist_file:
         args += ["--blacklist_file", "test/runtimes/" + blacklist_file]
@@ -61,7 +62,7 @@ def blacklist_test(name, blacklist_file):
     """Test that a blacklist parses correctly."""
     go_test(
         name = name + "_blacklist_test",
-        embed = [":runner"],
+        library = ":runner",
         srcs = ["blacklist_test.go"],
         args = ["--blacklist_file", "test/runtimes/" + blacklist_file],
         data = [blacklist_file],
diff --git a/test/runtimes/images/proctor/BUILD b/test/runtimes/images/proctor/BUILD
index 09dc6c42f..85e004c45 100644
--- a/test/runtimes/images/proctor/BUILD
+++ b/test/runtimes/images/proctor/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_test")
+load("//tools:defs.bzl", "go_binary", "go_test")
 
 package(licenses = ["notice"])
 
@@ -19,7 +19,7 @@ go_test(
     name = "proctor_test",
     size = "small",
     srcs = ["proctor_test.go"],
-    embed = [":proctor"],
+    library = ":proctor",
     deps = [
         "//runsc/testutil",
     ],
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 90d52e73b..40e974314 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 load("//test/syscalls:build_defs.bzl", "syscall_test")
 
 package(licenses = ["notice"])
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index aaf77c65b..1df761dd0 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -1,5 +1,7 @@
 """Defines a rule for syscall test targets."""
 
+load("//tools:defs.bzl", "loopback")
+
 # syscall_test is a macro that will create targets to run the given test target
 # on the host (native) and runsc.
 def syscall_test(
@@ -135,6 +137,7 @@ def _syscall_test(
         name = name,
         data = [
             ":syscall_test_runner",
+            loopback,
             test,
         ],
         args = args,
@@ -148,6 +151,3 @@ def sh_test(**kwargs):
     native.sh_test(
         **kwargs
     )
-
-def select_for_linux(for_linux, for_others = []):
-    return for_linux
diff --git a/test/syscalls/gtest/BUILD b/test/syscalls/gtest/BUILD
index 9293f25cb..de4b2727c 100644
--- a/test/syscalls/gtest/BUILD
+++ b/test/syscalls/gtest/BUILD
@@ -1,12 +1,9 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "gtest",
     srcs = ["gtest.go"],
-    importpath = "gvisor.dev/gvisor/test/syscalls/gtest",
-    visibility = [
-        "//test:__subpackages__",
-    ],
+    visibility = ["//:sandbox"],
 )
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 4c7ec3f06..c2ef50c1d 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1,5 +1,4 @@
-load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
-load("//test/syscalls:build_defs.bzl", "select_for_linux")
+load("//tools:defs.bzl", "cc_binary", "cc_library", "default_net_util", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -126,13 +125,11 @@ cc_library(
     testonly = 1,
     srcs = [
         "socket_test_util.cc",
-    ] + select_for_linux(
-        [
-            "socket_test_util_impl.cc",
-        ],
-    ),
+        "socket_test_util_impl.cc",
+    ],
     hdrs = ["socket_test_util.h"],
-    deps = [
+    defines = select_system(),
+    deps = default_net_util() + [
         "@com_google_googletest//:gtest",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -143,8 +140,7 @@ cc_library(
         "//test/util:temp_path",
         "//test/util:test_util",
         "//test/util:thread_util",
-    ] + select_for_linux([
-    ]),
+    ],
 )
 
 cc_library(
@@ -1443,6 +1439,7 @@ cc_binary(
     srcs = ["arch_prctl.cc"],
     linkstatic = 1,
     deps = [
+        "//test/util:file_descriptor",
         "//test/util:test_main",
         "//test/util:test_util",
         "@com_google_googletest//:gtest",
@@ -3383,11 +3380,11 @@ cc_library(
     name = "udp_socket_test_cases",
     testonly = 1,
     srcs = [
-        "udp_socket_test_cases.cc",
-    ] + select_for_linux([
         "udp_socket_errqueue_test_case.cc",
-    ]),
+        "udp_socket_test_cases.cc",
+    ],
     hdrs = ["udp_socket_test_cases.h"],
+    defines = select_system(),
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
diff --git a/test/syscalls/linux/arch_prctl.cc b/test/syscalls/linux/arch_prctl.cc
index 81bf5a775..3a901faf5 100644
--- a/test/syscalls/linux/arch_prctl.cc
+++ b/test/syscalls/linux/arch_prctl.cc
@@ -14,8 +14,10 @@
 
 #include <asm/prctl.h>
 #include <sys/prctl.h>
+#include <sys/syscall.h>
 
 #include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
 #include "test/util/test_util.h"
 
 // glibc does not provide a prototype for arch_prctl() so declare it here.
diff --git a/test/syscalls/linux/rseq/BUILD b/test/syscalls/linux/rseq/BUILD
index 5cfe4e56f..ed488dbc2 100644
--- a/test/syscalls/linux/rseq/BUILD
+++ b/test/syscalls/linux/rseq/BUILD
@@ -1,8 +1,7 @@
 # This package contains a standalone rseq test binary. This binary must not
 # depend on libc, which might use rseq itself.
 
-load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", "cc_flags_supplier")
-load("@rules_cc//cc:defs.bzl", "cc_library")
+load("//tools:defs.bzl", "cc_flags_supplier", "cc_library", "cc_toolchain")
 
 package(licenses = ["notice"])
 
@@ -37,8 +36,8 @@ genrule(
         "$(location start.S)",
     ]),
     toolchains = [
+        cc_toolchain,
         ":no_pie_cc_flags",
-        "@bazel_tools//tools/cpp:current_cc_toolchain",
     ],
     visibility = ["//:sandbox"],
 )
diff --git a/test/syscalls/linux/udp_socket_errqueue_test_case.cc b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
index 147978f46..9a24e1df0 100644
--- a/test/syscalls/linux/udp_socket_errqueue_test_case.cc
+++ b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifndef __fuchsia__
+
 #include "test/syscalls/linux/udp_socket_test_cases.h"
 
 #include <arpa/inet.h>
@@ -52,3 +54,5 @@ TEST_P(UdpSocketTest, ErrorQueue) {
 
 }  // namespace testing
 }  // namespace gvisor
+
+#endif  // __fuchsia__
diff --git a/test/uds/BUILD b/test/uds/BUILD
index a3843e699..51e2c7ce8 100644
--- a/test/uds/BUILD
+++ b/test/uds/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -9,7 +9,6 @@ go_library(
     name = "uds",
     testonly = 1,
     srcs = ["uds.go"],
-    importpath = "gvisor.dev/gvisor/test/uds",
     deps = [
         "//pkg/log",
         "//pkg/unet",
diff --git a/test/util/BUILD b/test/util/BUILD
index cbc728159..3c732be62 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -1,5 +1,4 @@
-load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
-load("//test/syscalls:build_defs.bzl", "select_for_linux")
+load("//tools:defs.bzl", "cc_library", "cc_test", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -142,12 +141,13 @@ cc_library(
 cc_library(
     name = "save_util",
     testonly = 1,
-    srcs = ["save_util.cc"] +
-           select_for_linux(
-               ["save_util_linux.cc"],
-               ["save_util_other.cc"],
-           ),
+    srcs = [
+        "save_util.cc",
+        "save_util_linux.cc",
+        "save_util_other.cc",
+    ],
     hdrs = ["save_util.h"],
+    defines = select_system(),
 )
 
 cc_library(
@@ -234,13 +234,16 @@ cc_library(
     testonly = 1,
     srcs = [
         "test_util.cc",
-    ] + select_for_linux(
-        [
-            "test_util_impl.cc",
-            "test_util_runfiles.cc",
+        "test_util_impl.cc",
+        "test_util_runfiles.cc",
+    ],
+    hdrs = ["test_util.h"],
+    defines = select_system(
+        fuchsia = [
+            "__opensource__",
+            "__fuchsia__",
         ],
     ),
-    hdrs = ["test_util.h"],
     deps = [
         ":fs_util",
         ":logging",
diff --git a/test/util/save_util_linux.cc b/test/util/save_util_linux.cc
index cd56118c0..d0aea8e6a 100644
--- a/test/util/save_util_linux.cc
+++ b/test/util/save_util_linux.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifdef __linux__
+
 #include <errno.h>
 #include <sys/syscall.h>
 #include <unistd.h>
@@ -43,3 +45,5 @@ void MaybeSave() {
 
 }  // namespace testing
 }  // namespace gvisor
+
+#endif
diff --git a/test/util/save_util_other.cc b/test/util/save_util_other.cc
index 1aca663b7..931af2c29 100644
--- a/test/util/save_util_other.cc
+++ b/test/util/save_util_other.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifndef __linux__
+
 namespace gvisor {
 namespace testing {
 
@@ -21,3 +23,5 @@ void MaybeSave() {
 
 }  // namespace testing
 }  // namespace gvisor
+
+#endif
diff --git a/test/util/test_util_runfiles.cc b/test/util/test_util_runfiles.cc
index 7210094eb..694d21692 100644
--- a/test/util/test_util_runfiles.cc
+++ b/test/util/test_util_runfiles.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifndef __fuchsia__
+
 #include <iostream>
 #include <string>
 
@@ -44,3 +46,5 @@ std::string RunfilePath(std::string path) {
 
 }  // namespace testing
 }  // namespace gvisor
+
+#endif  // __fuchsia__
diff --git a/tools/BUILD b/tools/BUILD
new file mode 100644
index 000000000..e73a9c885
--- /dev/null
+++ b/tools/BUILD
@@ -0,0 +1,3 @@
+package(licenses = ["notice"])
+
+exports_files(["nogo.js"])
diff --git a/tools/build/BUILD b/tools/build/BUILD
new file mode 100644
index 000000000..0c0ce3f4d
--- /dev/null
+++ b/tools/build/BUILD
@@ -0,0 +1,10 @@
+package(licenses = ["notice"])
+
+# In bazel, no special support is required for loopback networking. This is
+# just a dummy data target that does not change the test environment.
+genrule(
+    name = "loopback",
+    outs = ["loopback.txt"],
+    cmd = "touch $@",
+    visibility = ["//visibility:public"],
+)
diff --git a/tools/build/defs.bzl b/tools/build/defs.bzl
new file mode 100644
index 000000000..d0556abd1
--- /dev/null
+++ b/tools/build/defs.bzl
@@ -0,0 +1,91 @@
+"""Bazel implementations of standard rules."""
+
+load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", _cc_flags_supplier = "cc_flags_supplier")
+load("@io_bazel_rules_go//go:def.bzl", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_test = "go_test", _go_tool_library = "go_tool_library")
+load("@io_bazel_rules_go//proto:def.bzl", _go_proto_library = "go_proto_library")
+load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test")
+load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar")
+load("@io_bazel_rules_docker//go:image.bzl", _go_image = "go_image")
+load("@io_bazel_rules_docker//container:container.bzl", _container_image = "container_image")
+load("@pydeps//:requirements.bzl", _py_requirement = "requirement")
+
+container_image = _container_image
+cc_binary = _cc_binary
+cc_library = _cc_library
+cc_flags_supplier = _cc_flags_supplier
+cc_proto_library = _cc_proto_library
+cc_test = _cc_test
+cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
+go_image = _go_image
+go_embed_data = _go_embed_data
+loopback = "//tools/build:loopback"
+proto_library = native.proto_library
+pkg_deb = _pkg_deb
+pkg_tar = _pkg_tar
+py_library = native.py_library
+py_binary = native.py_binary
+py_test = native.py_test
+
+def go_binary(name, static = False, pure = False, **kwargs):
+    if static:
+        kwargs["static"] = "on"
+    if pure:
+        kwargs["pure"] = "on"
+    _go_binary(
+        name = name,
+        **kwargs
+    )
+
+def go_library(name, **kwargs):
+    _go_library(
+        name = name,
+        importpath = "gvisor.dev/gvisor/" + native.package_name(),
+        **kwargs
+    )
+
+def go_tool_library(name, **kwargs):
+    _go_tool_library(
+        name = name,
+        importpath = "gvisor.dev/gvisor/" + native.package_name(),
+        **kwargs
+    )
+
+def go_proto_library(name, proto, **kwargs):
+    deps = kwargs.pop("deps", [])
+    _go_proto_library(
+        name = name,
+        importpath = "gvisor.dev/gvisor/" + native.package_name() + "/" + name,
+        proto = proto,
+        deps = [dep.replace("_proto", "_go_proto") for dep in deps],
+        **kwargs
+    )
+
+def go_test(name, **kwargs):
+    library = kwargs.pop("library", None)
+    if library:
+        kwargs["embed"] = [library]
+    _go_test(
+        name = name,
+        **kwargs
+    )
+
+def py_requirement(name, direct = False):
+    return _py_requirement(name)
+
+def select_arch(amd64 = "amd64", arm64 = "arm64", default = None, **kwargs):
+    values = {
+        "@bazel_tools//src/conditions:linux_x86_64": amd64,
+        "@bazel_tools//src/conditions:linux_aarch64": arm64,
+    }
+    if default:
+        values["//conditions:default"] = default
+    return select(values, **kwargs)
+
+def select_system(linux = ["__linux__"], **kwargs):
+    return linux  # Only Linux supported.
+
+def default_installer():
+    return None
+
+def default_net_util():
+    return []  # Nothing needed.
diff --git a/tools/checkunsafe/BUILD b/tools/checkunsafe/BUILD
index d85c56131..92ba8ab06 100644
--- a/tools/checkunsafe/BUILD
+++ b/tools/checkunsafe/BUILD
@@ -1,11 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_tool_library")
+load("//tools:defs.bzl", "go_tool_library")
 
 package(licenses = ["notice"])
 
 go_tool_library(
     name = "checkunsafe",
     srcs = ["check_unsafe.go"],
-    importpath = "checkunsafe",
     visibility = ["//visibility:public"],
     deps = [
         "@org_golang_x_tools//go/analysis:go_tool_library",
diff --git a/tools/defs.bzl b/tools/defs.bzl
new file mode 100644
index 000000000..819f12b0d
--- /dev/null
+++ b/tools/defs.bzl
@@ -0,0 +1,154 @@
+"""Wrappers for common build rules.
+
+These wrappers apply common BUILD configurations (e.g., proto_library
+automagically creating cc_ and go_ proto targets) and act as a single point of
+change for Google-internal and bazel-compatible rules.
+"""
+
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
+load("//tools/build:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+
+# Delegate directly.
+cc_binary = _cc_binary
+cc_library = _cc_library
+cc_test = _cc_test
+cc_toolchain = _cc_toolchain
+cc_flags_supplier = _cc_flags_supplier
+container_image = _container_image
+go_embed_data = _go_embed_data
+go_image = _go_image
+go_test = _go_test
+go_tool_library = _go_tool_library
+pkg_deb = _pkg_deb
+pkg_tar = _pkg_tar
+py_library = _py_library
+py_binary = _py_binary
+py_test = _py_test
+py_requirement = _py_requirement
+select_arch = _select_arch
+select_system = _select_system
+loopback = _loopback
+default_installer = _default_installer
+default_net_util = _default_net_util
+
+def go_binary(name, **kwargs):
+    """Wraps the standard go_binary.
+
+    Args:
+      name: the rule name.
+      **kwargs: standard go_binary arguments.
+    """
+    _go_binary(
+        name = name,
+        **kwargs
+    )
+
+def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs):
+    """Wraps the standard go_library and does stateification and marshalling.
+
+    The recommended way is to use this rule with mostly identical configuration as the native
+    go_library rule.
+
+    These definitions provide additional flags (stateify, marshal) that can be used
+    with the generators to automatically supplement the library code.
+
+    load("//tools:defs.bzl", "go_library")
+
+    go_library(
+        name = "foo",
+        srcs = ["foo.go"],
+    )
+
+    Args:
+      name: the rule name.
+      srcs: the library sources.
+      deps: the library dependencies.
+      imports: imports required for stateify.
+      stateify: whether stateify is enabled (default: true).
+      marshal: whether marshal is enabled (default: false).
+      **kwargs: standard go_library arguments.
+    """
+    if stateify:
+        # Only do stateification for non-state packages without manual autogen.
+        go_stateify(
+            name = name + "_state_autogen",
+            srcs = [src for src in srcs if src.endswith(".go")],
+            imports = imports,
+            package = name,
+            arch = select_arch(),
+            out = name + "_state_autogen.go",
+        )
+        all_srcs = srcs + [name + "_state_autogen.go"]
+        if "//pkg/state" not in deps:
+            all_deps = deps + ["//pkg/state"]
+        else:
+            all_deps = deps
+    else:
+        all_deps = deps
+        all_srcs = srcs
+    if marshal:
+        go_marshal(
+            name = name + "_abi_autogen",
+            srcs = [src for src in srcs if src.endswith(".go")],
+            debug = False,
+            imports = imports,
+            package = name,
+        )
+        # Add only the marshal dependencies that are not already present.
+        extra_deps = [dep for dep in marshal_deps if dep not in all_deps]
+        all_deps = all_deps + extra_deps
+
+        # Extend all_srcs (not srcs) so that the stateify output appended
+        # above is kept when both stateify and marshal are enabled.
+        all_srcs = all_srcs + [name + "_abi_autogen_unsafe.go"]
+
+    _go_library(
+        name = name,
+        srcs = all_srcs,
+        deps = all_deps,
+        **kwargs
+    )
+
+    if marshal:
+        # Ignore importpath for go_test.
+        kwargs.pop("importpath", None)
+
+        _go_test(
+            name = name + "_abi_autogen_test",
+            srcs = [name + "_abi_autogen_test.go"],
+            library = ":" + name,
+            deps = marshal_test_deps,
+            **kwargs
+        )
+
+def proto_library(name, srcs, **kwargs):
+    """Wraps the standard proto_library.
+
+    Given a proto_library named "foo", this produces three different targets:
+    - foo_proto: proto_library rule.
+    - foo_go_proto: go_proto_library rule.
+    - foo_cc_proto: cc_proto_library rule.
+
+    Args:
+      srcs: the proto sources.
+      **kwargs: standard proto_library arguments.
+    """
+    deps = kwargs.pop("deps", [])
+    _proto_library(
+        name = name + "_proto",
+        srcs = srcs,
+        deps = deps,
+        **kwargs
+    )
+    _go_proto_library(
+        name = name + "_go_proto",
+        proto = ":" + name + "_proto",
+        deps = deps,
+        **kwargs
+    )
+    _cc_proto_library(
+        name = name + "_cc_proto",
+        deps = [":" + name + "_proto"],
+        **kwargs
+    )
diff --git a/tools/go_generics/BUILD b/tools/go_generics/BUILD
index 39318b877..069df3856 100644
--- a/tools/go_generics/BUILD
+++ b/tools/go_generics/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/tools/go_generics/globals/BUILD b/tools/go_generics/globals/BUILD
index 74853c7d2..38caa3ce7 100644
--- a/tools/go_generics/globals/BUILD
+++ b/tools/go_generics/globals/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -8,6 +8,6 @@ go_library(
         "globals_visitor.go",
         "scope.go",
     ],
-    importpath = "gvisor.dev/gvisor/tools/go_generics/globals",
+    stateify = False,
     visibility = ["//tools/go_generics:__pkg__"],
 )
diff --git a/tools/go_generics/go_merge/BUILD b/tools/go_generics/go_merge/BUILD
index 02b09120e..b7d35e272 100644
--- a/tools/go_generics/go_merge/BUILD
+++ b/tools/go_generics/go_merge/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/tools/go_generics/rules_tests/BUILD b/tools/go_generics/rules_tests/BUILD
index 9d26a88b7..8a329dfc6 100644
--- a/tools/go_generics/rules_tests/BUILD
+++ b/tools/go_generics/rules_tests/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
 
 package(licenses = ["notice"])
diff --git a/tools/go_marshal/BUILD b/tools/go_marshal/BUILD
index c862b277c..80d9c0504 100644
--- a/tools/go_marshal/BUILD
+++ b/tools/go_marshal/BUILD
@@ -1,6 +1,6 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_binary(
     name = "go_marshal",
diff --git a/tools/go_marshal/README.md b/tools/go_marshal/README.md
index 481575bd3..4886efddf 100644
--- a/tools/go_marshal/README.md
+++ b/tools/go_marshal/README.md
@@ -20,19 +20,7 @@ comment `// +marshal`.
 
 # Usage
 
-See `defs.bzl`: two new rules are provided, `go_marshal` and `go_library`.
-
-The recommended way to generate a go library with marshalling is to use the
-`go_library` with mostly identical configuration as the native go_library rule.
-
-```
-load("<PKGPATH>/gvisor/tools/go_marshal:defs.bzl", "go_library")
-
-go_library(
-    name = "foo",
-    srcs = ["foo.go"],
-)
-```
+See `defs.bzl`: a new rule is provided, `go_marshal`.
 
 Under the hood, the `go_marshal` rule is used to generate a file that will
 appear in a Go target; the output file should appear explicitly in a srcs list.
@@ -54,11 +42,7 @@ go_library(
         "foo.go",
         "foo_abi.go",
     ],
-    deps = [
-        "<PKGPATH>/gvisor/pkg/abi",
-        "<PKGPATH>/gvisor/pkg/sentry/safemem/safemem",
-        "<PKGPATH>/gvisor/pkg/sentry/usermem/usermem",
-    ],
+    ...
 )
 ```
 
@@ -69,22 +53,6 @@ These tests use reflection to verify properties of the ABI struct, and should be
 considered part of the generated interfaces (but are too expensive to execute at
 runtime). Ensure these tests run at some point.
 
-```
-$ cat BUILD
-load("<PKGPATH>/gvisor/tools/go_marshal:defs.bzl", "go_library")
-
-go_library(
-    name = "foo",
-    srcs = ["foo.go"],
-)
-$ blaze build :foo
-$ blaze query ...
-<path-to-dir>:foo_abi_autogen
-<path-to-dir>:foo_abi_autogen_test
-$ blaze test :foo_abi_autogen_test
-<test-output>
-```
-
 # Restrictions
 
 Not all valid go type definitions can be used with `go_marshal`. `go_marshal` is
@@ -131,22 +99,6 @@ for embedded structs that are not aligned.
 Because of this, it's generally best to avoid using `marshal:"unaligned"` and
 insert explicit padding fields instead.
 
-## Debugging go_marshal
-
-To enable debugging output from the go marshal tool, pass the `-debug` flag to
-the tool. When using the build rules from above, add a `debug = True` field to
-the build rule like this:
-
-```
-load("<PKGPATH>/gvisor/tools/go_marshal:defs.bzl", "go_library")
-
-go_library(
-    name = "foo",
-    srcs = ["foo.go"],
-    debug = True,
-)
-```
-
 ## Modifying the `go_marshal` Tool
 
 The following are some guidelines for modifying the `go_marshal` tool:
diff --git a/tools/go_marshal/analysis/BUILD b/tools/go_marshal/analysis/BUILD
index c859ced77..c2a4d45c4 100644
--- a/tools/go_marshal/analysis/BUILD
+++ b/tools/go_marshal/analysis/BUILD
@@ -1,12 +1,11 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "analysis",
     testonly = 1,
     srcs = ["analysis_unsafe.go"],
-    importpath = "gvisor.dev/gvisor/tools/go_marshal/analysis",
     visibility = [
         "//:sandbox",
     ],
diff --git a/tools/go_marshal/defs.bzl b/tools/go_marshal/defs.bzl
index c32eb559f..2918ceffe 100644
--- a/tools/go_marshal/defs.bzl
+++ b/tools/go_marshal/defs.bzl
@@ -1,57 +1,14 @@
-"""Marshal is a tool for generating marshalling interfaces for Go types.
-
-The recommended way is to use the go_library rule defined below with mostly
-identical configuration as the native go_library rule.
-
-load("//tools/go_marshal:defs.bzl", "go_library")
-
-go_library(
-    name = "foo",
-    srcs = ["foo.go"],
-)
-
-Under the hood, the go_marshal rule is used to generate a file that will
-appear in a Go target; the output file should appear explicitly in a srcs list.
-For example (the above is still the preferred way):
-
-load("//tools/go_marshal:defs.bzl", "go_marshal")
-
-go_marshal(
-    name = "foo_abi",
-    srcs = ["foo.go"],
-    out = "foo_abi.go",
-    package = "foo",
-)
-
-go_library(
-    name = "foo",
-    srcs = [
-        "foo.go",
-        "foo_abi.go",
-    ],
-    deps = [
-       "//tools/go_marshal:marshal",
-       "//pkg/sentry/platform/safecopy",
-       "//pkg/sentry/usermem",
-    ],
-)
-"""
-
-load("@io_bazel_rules_go//go:def.bzl", _go_library = "go_library", _go_test = "go_test")
+"""Marshal is a tool for generating marshalling interfaces for Go types."""
 
 def _go_marshal_impl(ctx):
     """Execute the go_marshal tool."""
     output = ctx.outputs.lib
     output_test = ctx.outputs.test
-    (build_dir, _, _) = ctx.build_file_path.rpartition("/BUILD")
-
-    decl = "/".join(["gvisor.dev/gvisor", build_dir])
 
     # Run the marshal command.
     args = ["-output=%s" % output.path]
     args += ["-pkg=%s" % ctx.attr.package]
     args += ["-output_test=%s" % output_test.path]
-    args += ["-declarationPkg=%s" % decl]
 
     if ctx.attr.debug:
         args += ["-debug"]
@@ -83,7 +40,6 @@ go_marshal = rule(
     implementation = _go_marshal_impl,
     attrs = {
         "srcs": attr.label_list(mandatory = True, allow_files = True),
-        "libname": attr.string(mandatory = True),
         "imports": attr.string_list(mandatory = False),
         "package": attr.string(mandatory = True),
         "debug": attr.bool(doc = "enable debugging output from the go_marshal tool"),
@@ -95,58 +51,14 @@ go_marshal = rule(
     },
 )
 
-def go_library(name, srcs, deps = [], imports = [], debug = False, **kwargs):
-    """wraps the standard go_library and does mashalling interface generation.
-
-    Args:
-      name: Same as native go_library.
-      srcs: Same as native go_library.
-      deps: Same as native go_library.
-      imports: Extra import paths to pass to the go_marshal tool.
-      debug: Enables debugging output from the go_marshal tool.
-      **kwargs: Remaining args to pass to the native go_library rule unmodified.
-    """
-    go_marshal(
-        name = name + "_abi_autogen",
-        libname = name,
-        srcs = [src for src in srcs if src.endswith(".go")],
-        debug = debug,
-        imports = imports,
-        package = name,
-    )
-
-    extra_deps = [
-        "//tools/go_marshal/marshal",
-        "//pkg/sentry/platform/safecopy",
-        "//pkg/sentry/usermem",
-    ]
-
-    all_srcs = srcs + [name + "_abi_autogen_unsafe.go"]
-    all_deps = deps + []  #  + extra_deps
-
-    for extra in extra_deps:
-        if extra not in deps:
-            all_deps.append(extra)
-
-    _go_library(
-        name = name,
-        srcs = all_srcs,
-        deps = all_deps,
-        **kwargs
-    )
-
-    # Don't pass importpath arg to go_test.
-    kwargs.pop("importpath", "")
-
-    _go_test(
-        name = name + "_abi_autogen_test",
-        srcs = [name + "_abi_autogen_test.go"],
-        # Generated test has a fixed set of dependencies since we generate these
-        # tests. They should only depend on the library generated above, and the
-        # Marshallable interface.
-        deps = [
-            ":" + name,
-            "//tools/go_marshal/analysis",
-        ],
-        **kwargs
-    )
+# marshal_deps are the dependencies required by generated code.
+marshal_deps = [
+    "//tools/go_marshal/marshal",
+    "//pkg/sentry/platform/safecopy",
+    "//pkg/sentry/usermem",
+]
+
+# marshal_test_deps are required by test targets.
+marshal_test_deps = [
+    "//tools/go_marshal/analysis",
+]
diff --git a/tools/go_marshal/gomarshal/BUILD b/tools/go_marshal/gomarshal/BUILD
index a0eae6492..c92b59dd6 100644
--- a/tools/go_marshal/gomarshal/BUILD
+++ b/tools/go_marshal/gomarshal/BUILD
@@ -1,6 +1,6 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "gomarshal",
@@ -10,7 +10,7 @@ go_library(
         "generator_tests.go",
         "util.go",
     ],
-    importpath = "gvisor.dev/gvisor/tools/go_marshal/gomarshal",
+    stateify = False,
     visibility = [
         "//:sandbox",
     ],
diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go
index 641ccd938..8392f3f6d 100644
--- a/tools/go_marshal/gomarshal/generator.go
+++ b/tools/go_marshal/gomarshal/generator.go
@@ -62,15 +62,12 @@ type Generator struct {
 	outputTest *os.File
 	// Package name for the generated file.
 	pkg string
-	// Go import path for package we're processing. This package should directly
-	// declare the type we're generating code for.
-	declaration string
 	// Set of extra packages to import in the generated file.
 	imports *importTable
 }
 
 // NewGenerator creates a new code Generator.
-func NewGenerator(srcs []string, out, outTest, pkg, declaration string, imports []string) (*Generator, error) {
+func NewGenerator(srcs []string, out, outTest, pkg string, imports []string) (*Generator, error) {
 	f, err := os.OpenFile(out, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
 	if err != nil {
 		return nil, fmt.Errorf("Couldn't open output file %q: %v", out, err)
@@ -80,12 +77,11 @@ func NewGenerator(srcs []string, out, outTest, pkg, declaration string, imports
 		return nil, fmt.Errorf("Couldn't open test output file %q: %v", out, err)
 	}
 	g := Generator{
-		inputs:      srcs,
-		output:      f,
-		outputTest:  fTest,
-		pkg:         pkg,
-		declaration: declaration,
-		imports:     newImportTable(),
+		inputs:     srcs,
+		output:     f,
+		outputTest: fTest,
+		pkg:        pkg,
+		imports:    newImportTable(),
 	}
 	for _, i := range imports {
 		// All imports on the extra imports list are unconditionally marked as
@@ -264,7 +260,7 @@ func (g *Generator) generateOne(t *ast.TypeSpec, fset *token.FileSet) *interface
 // generateOneTestSuite generates a test suite for the automatically generated
 // implementations type t.
 func (g *Generator) generateOneTestSuite(t *ast.TypeSpec) *testGenerator {
-	i := newTestGenerator(t, g.declaration)
+	i := newTestGenerator(t)
 	i.emitTests()
 	return i
 }
@@ -359,7 +355,7 @@ func (g *Generator) Run() error {
 // source file.
 func (g *Generator) writeTests(ts []*testGenerator) error {
 	var b sourceBuffer
-	b.emit("package %s_test\n\n", g.pkg)
+	b.emit("package %s\n\n", g.pkg)
 	if err := b.write(g.outputTest); err != nil {
 		return err
 	}
diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go
index df25cb5b2..bcda17c3b 100644
--- a/tools/go_marshal/gomarshal/generator_tests.go
+++ b/tools/go_marshal/gomarshal/generator_tests.go
@@ -46,7 +46,7 @@ type testGenerator struct {
 	decl *importStmt
 }
 
-func newTestGenerator(t *ast.TypeSpec, declaration string) *testGenerator {
+func newTestGenerator(t *ast.TypeSpec) *testGenerator {
 	if _, ok := t.Type.(*ast.StructType); !ok {
 		panic(fmt.Sprintf("Attempting to generate code for a not struct type %v", t))
 	}
@@ -59,14 +59,12 @@ func newTestGenerator(t *ast.TypeSpec, declaration string) *testGenerator {
 	for _, i := range standardImports {
 		g.imports.add(i).markUsed()
 	}
-	g.decl = g.imports.add(declaration)
-	g.decl.markUsed()
 
 	return g
 }
 
 func (g *testGenerator) typeName() string {
-	return fmt.Sprintf("%s.%s", g.decl.name, g.t.Name.Name)
+	return g.t.Name.Name
 }
 
 func (g *testGenerator) forEachField(fn func(f *ast.Field)) {
diff --git a/tools/go_marshal/main.go b/tools/go_marshal/main.go
index 3d12eb93c..e1a97b311 100644
--- a/tools/go_marshal/main.go
+++ b/tools/go_marshal/main.go
@@ -31,11 +31,10 @@ import (
 )
 
 var (
-	pkg            = flag.String("pkg", "", "output package")
-	output         = flag.String("output", "", "output file")
-	outputTest     = flag.String("output_test", "", "output file for tests")
-	imports        = flag.String("imports", "", "comma-separated list of extra packages to import in generated code")
-	declarationPkg = flag.String("declarationPkg", "", "import path of target declaring the types we're generating on")
+	pkg        = flag.String("pkg", "", "output package")
+	output     = flag.String("output", "", "output file")
+	outputTest = flag.String("output_test", "", "output file for tests")
+	imports    = flag.String("imports", "", "comma-separated list of extra packages to import in generated code")
 )
 
 func main() {
@@ -62,7 +61,7 @@ func main() {
 		// as an import.
 		extraImports = strings.Split(*imports, ",")
 	}
-	g, err := gomarshal.NewGenerator(flag.Args(), *output, *outputTest, *pkg, *declarationPkg, extraImports)
+	g, err := gomarshal.NewGenerator(flag.Args(), *output, *outputTest, *pkg, extraImports)
 	if err != nil {
 		panic(err)
 	}
diff --git a/tools/go_marshal/marshal/BUILD b/tools/go_marshal/marshal/BUILD
index 47dda97a1..ad508c72f 100644
--- a/tools/go_marshal/marshal/BUILD
+++ b/tools/go_marshal/marshal/BUILD
@@ -1,13 +1,12 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "marshal",
     srcs = [
         "marshal.go",
     ],
-    importpath = "gvisor.dev/gvisor/tools/go_marshal/marshal",
     visibility = [
         "//:sandbox",
     ],
diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD
index d412e1ccf..38ba49fed 100644
--- a/tools/go_marshal/test/BUILD
+++ b/tools/go_marshal/test/BUILD
@@ -1,7 +1,6 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_marshal:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 package_group(
     name = "gomarshal_test",
@@ -25,6 +24,6 @@ go_library(
     name = "test",
     testonly = 1,
     srcs = ["test.go"],
-    importpath = "gvisor.dev/gvisor/tools/go_marshal/test",
+    marshal = True,
     deps = ["//tools/go_marshal/test/external"],
 )
diff --git a/tools/go_marshal/test/external/BUILD b/tools/go_marshal/test/external/BUILD
index 9bb89e1da..0cf6da603 100644
--- a/tools/go_marshal/test/external/BUILD
+++ b/tools/go_marshal/test/external/BUILD
@@ -1,11 +1,11 @@
-load("//tools/go_marshal:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "external",
     testonly = 1,
     srcs = ["external.go"],
-    importpath = "gvisor.dev/gvisor/tools/go_marshal/test/external",
+    marshal = True,
     visibility = ["//tools/go_marshal/test:gomarshal_test"],
 )
diff --git a/tools/go_stateify/BUILD b/tools/go_stateify/BUILD
index bb53f8ae9..a133d6f8b 100644
--- a/tools/go_stateify/BUILD
+++ b/tools/go_stateify/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/tools/go_stateify/defs.bzl b/tools/go_stateify/defs.bzl
index 33267c074..0f261d89f 100644
--- a/tools/go_stateify/defs.bzl
+++ b/tools/go_stateify/defs.bzl
@@ -1,41 +1,4 @@
-"""Stateify is a tool for generating state wrappers for Go types.
-
-The recommended way is to use the go_library rule defined below with mostly
-identical configuration as the native go_library rule.
-
-load("//tools/go_stateify:defs.bzl", "go_library")
-
-go_library(
-    name = "foo",
-    srcs = ["foo.go"],
-)
-
-Under the hood, the go_stateify rule is used to generate a file that will
-appear in a Go target; the output file should appear explicitly in a srcs list.
-For example (the above is still the preferred way):
-
-load("//tools/go_stateify:defs.bzl", "go_stateify")
-
-go_stateify(
-    name = "foo_state",
-    srcs = ["foo.go"],
-    out = "foo_state.go",
-    package = "foo",
-)
-
-go_library(
-    name = "foo",
-    srcs = [
-        "foo.go",
-        "foo_state.go",
-    ],
-    deps = [
-        "//pkg/state",
-    ],
-)
-"""
-
-load("@io_bazel_rules_go//go:def.bzl", _go_library = "go_library")
+"""Stateify is a tool for generating state wrappers for Go types."""
 
 def _go_stateify_impl(ctx):
     """Implementation for the stateify tool."""
@@ -103,43 +66,3 @@ files and must be added to the srcs of the relevant go_library.
         "_statepkg": attr.string(default = "gvisor.dev/gvisor/pkg/state"),
     },
 )
-
-def go_library(name, srcs, deps = [], imports = [], **kwargs):
-    """Standard go_library wrapped which generates state source files.
-
-    Args:
-      name: the name of the go_library rule.
-      srcs: sources of the go_library. Each will be processed for stateify
-            annotations.
-      deps: dependencies for the go_library.
-      imports: an optional list of extra non-aliased, Go-style absolute import
-               paths required for stateified types.
-      **kwargs: passed to go_library.
-    """
-    if "encode_unsafe.go" not in srcs and (name + "_state_autogen.go") not in srcs:
-        # Only do stateification for non-state packages without manual autogen.
-        go_stateify(
-            name = name + "_state_autogen",
-            srcs = [src for src in srcs if src.endswith(".go")],
-            imports = imports,
-            package = name,
-            arch = select({
-                "@bazel_tools//src/conditions:linux_aarch64": "arm64",
-                "//conditions:default": "amd64",
-            }),
-            out = name + "_state_autogen.go",
-        )
-        all_srcs = srcs + [name + "_state_autogen.go"]
-        if "//pkg/state" not in deps:
-            all_deps = deps + ["//pkg/state"]
-        else:
-            all_deps = deps
-    else:
-        all_deps = deps
-        all_srcs = srcs
-    _go_library(
-        name = name,
-        srcs = all_srcs,
-        deps = all_deps,
-        **kwargs
-    )
diff --git a/tools/images/BUILD b/tools/images/BUILD
index 2b77c2737..f1699b184 100644
--- a/tools/images/BUILD
+++ b/tools/images/BUILD
@@ -1,4 +1,4 @@
-load("@rules_cc//cc:defs.bzl", "cc_binary")
+load("//tools:defs.bzl", "cc_binary")
 load("//tools/images:defs.bzl", "vm_image", "vm_test")
 
 package(
diff --git a/tools/images/defs.bzl b/tools/images/defs.bzl
index d8e422a5d..32235813a 100644
--- a/tools/images/defs.bzl
+++ b/tools/images/defs.bzl
@@ -28,6 +28,8 @@ The vm_test rule can be used to execute a command remotely. For example,
   )
 """
 
+load("//tools:defs.bzl", "default_installer")
+
 def _vm_image_impl(ctx):
     script_paths = []
     for script in ctx.files.scripts:
@@ -165,8 +167,8 @@ def vm_test(
     targets = kwargs.pop("targets", [])
     if installer:
         targets = [installer] + targets
-    targets = [
-    ] + targets
+    if default_installer():
+        targets = [default_installer()] + targets
     _vm_test(
         tags = [
             "local",
diff --git a/tools/issue_reviver/BUILD b/tools/issue_reviver/BUILD
index ee7ea11fd..4ef1a3124 100644
--- a/tools/issue_reviver/BUILD
+++ b/tools/issue_reviver/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
diff --git a/tools/issue_reviver/github/BUILD b/tools/issue_reviver/github/BUILD
index 6da22ba1c..da4133472 100644
--- a/tools/issue_reviver/github/BUILD
+++ b/tools/issue_reviver/github/BUILD
@@ -1,11 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "github",
     srcs = ["github.go"],
-    importpath = "gvisor.dev/gvisor/tools/issue_reviver/github",
     visibility = [
         "//tools/issue_reviver:__subpackages__",
     ],
diff --git a/tools/issue_reviver/reviver/BUILD b/tools/issue_reviver/reviver/BUILD
index 2c3675977..d262932bd 100644
--- a/tools/issue_reviver/reviver/BUILD
+++ b/tools/issue_reviver/reviver/BUILD
@@ -1,11 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "reviver",
     srcs = ["reviver.go"],
-    importpath = "gvisor.dev/gvisor/tools/issue_reviver/reviver",
     visibility = [
         "//tools/issue_reviver:__subpackages__",
     ],
@@ -15,5 +14,5 @@ go_test(
     name = "reviver_test",
     size = "small",
     srcs = ["reviver_test.go"],
-    embed = [":reviver"],
+    library = ":reviver",
 )
diff --git a/tools/workspace_status.sh b/tools/workspace_status.sh
index fb09ff331..a22c8c9f2 100755
--- a/tools/workspace_status.sh
+++ b/tools/workspace_status.sh
@@ -15,4 +15,4 @@
 # limitations under the License.
 
 # The STABLE_ prefix will trigger a re-link if it changes.
-echo STABLE_VERSION $(git describe --always --tags --abbrev=12 --dirty)
+echo STABLE_VERSION $(git describe --always --tags --abbrev=12 --dirty || echo 0.0.0)
diff --git a/vdso/BUILD b/vdso/BUILD
index 2b6744c26..d37d4266d 100644
--- a/vdso/BUILD
+++ b/vdso/BUILD
@@ -3,20 +3,10 @@
 #   normal system VDSO (time, gettimeofday, clock_gettimeofday) but which uses
 #   timekeeping parameters managed by the sandbox kernel.
 
-load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", "cc_flags_supplier")
+load("//tools:defs.bzl", "cc_flags_supplier", "cc_toolchain", "select_arch")
 
 package(licenses = ["notice"])
 
-config_setting(
-    name = "x86_64",
-    constraint_values = ["@bazel_tools//platforms:x86_64"],
-)
-
-config_setting(
-    name = "aarch64",
-    constraint_values = ["@bazel_tools//platforms:aarch64"],
-)
-
 genrule(
     name = "vdso",
     srcs = [
@@ -39,14 +29,15 @@ genrule(
           "-O2 " +
           "-std=c++11 " +
           "-fPIC " +
+          "-fno-sanitize=all " +
           # Some toolchains enable stack protector by default. Disable it, the
           # VDSO has no hooks to handle failures.
           "-fno-stack-protector " +
           "-fuse-ld=gold " +
-          select({
-              ":x86_64": "-m64 ",
-              "//conditions:default": "",
-          }) +
+          select_arch(
+              amd64 = "-m64 ",
+              arm64 = "",
+          ) +
           "-shared " +
           "-nostdlib " +
           "-Wl,-soname=linux-vdso.so.1 " +
@@ -55,12 +46,10 @@ genrule(
           "-Wl,-Bsymbolic " +
           "-Wl,-z,max-page-size=4096 " +
           "-Wl,-z,common-page-size=4096 " +
-          select(
-              {
-                  ":x86_64": "-Wl,-T$(location vdso_amd64.lds) ",
-                  ":aarch64": "-Wl,-T$(location vdso_arm64.lds) ",
-              },
-              no_match_error = "Unsupported architecture",
+          select_arch(
+              amd64 = "-Wl,-T$(location vdso_amd64.lds) ",
+              arm64 = "-Wl,-T$(location vdso_arm64.lds) ",
+              no_match_error = "unsupported architecture",
           ) +
           "-o $(location vdso.so) " +
           "$(location vdso.cc) " +
@@ -73,7 +62,7 @@ genrule(
     ],
     features = ["-pie"],
     toolchains = [
-        "@bazel_tools//tools/cpp:current_cc_toolchain",
+        cc_toolchain,
         ":no_pie_cc_flags",
     ],
     visibility = ["//:sandbox"],
-- 
cgit v1.2.3


From 13c1f38dfa215ab3e3cc70642721f55ab226d5b7 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 27 Jan 2020 12:19:20 -0800
Subject: Update bug number for supporting extended attribute namespaces.

PiperOrigin-RevId: 291774815
---
 pkg/sentry/syscalls/linux/sys_xattr.go | 1 +
 test/syscalls/linux/xattr.cc           | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index e35c077d6..77deb8980 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -103,6 +103,7 @@ func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr, size uint64)
 		return 0, "", err
 	}
 
+	// TODO(b/148380782): Support xattrs in namespaces other than "user".
 	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
 		return 0, "", syserror.EOPNOTSUPP
 	}
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index e77c355d7..ab21d68c6 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -131,7 +131,7 @@ TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
 }
 
 TEST_F(XattrTest, XattrTrustedWithNonadmin) {
-  // TODO(b/127675828): Support setxattr and getxattr with "trusted" prefix.
+  // TODO(b/148380782): Support setxattr and getxattr with "trusted" prefix.
   SKIP_IF(IsRunningOnGvisor());
   SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
 
-- 
cgit v1.2.3


From 5776a7b6f6b52faf6e0735c3f4a892639c1bd773 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 27 Jan 2020 18:26:26 -0800
Subject: Fix header ordering and format all C++ code.

PiperOrigin-RevId: 291844200
---
 CONTRIBUTING.md                                    |  3 +-
 test/syscalls/linux/32bit.cc                       |  2 +-
 test/syscalls/linux/fpsig_fork.cc                  |  4 +-
 test/syscalls/linux/fpsig_nested.cc                |  8 +--
 test/syscalls/linux/madvise.cc                     |  4 +-
 test/syscalls/linux/mempolicy.cc                   |  6 +-
 test/syscalls/linux/mlock.cc                       |  1 -
 test/syscalls/linux/msync.cc                       |  4 +-
 test/syscalls/linux/ptrace.cc                      |  3 +-
 test/syscalls/linux/seccomp.cc                     |  9 ++-
 test/syscalls/linux/sigaltstack.cc                 |  4 +-
 test/syscalls/linux/sigiret.cc                     |  4 +-
 test/syscalls/linux/socket_stream_blocking.cc      | 64 +++++++++++-----------
 test/syscalls/linux/stat.cc                        |  2 +-
 .../linux/udp_socket_errqueue_test_case.cc         |  3 +-
 test/util/capability_util.cc                       |  8 +--
 test/util/fs_util.cc                               |  2 +-
 test/util/multiprocess_util.h                      |  3 +-
 18 files changed, 68 insertions(+), 66 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5d46168bc..55a1ad0d9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -36,7 +36,8 @@ directory tree.
 
 All Go code should conform to the [Go style guidelines][gostyle]. C++ code
 should conform to the [Google C++ Style Guide][cppstyle] and the guidelines
-described for [tests][teststyle].
+described for [tests][teststyle]. Note that code may be automatically formatted
+per the guidelines when merged.
 
 As a secure runtime, we need to maintain the safety of all of code included in
 gVisor. The following rules help mitigate issues.
diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index a7cbee06b..6a15d47e1 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -71,7 +71,7 @@ void ExitGroup32(const char instruction[2], int code) {
       "iretl\n"
       "int $3\n"
       :
-      : [code] "m"(code), [ip] "d"(m.ptr())
+      : [ code ] "m"(code), [ ip ] "d"(m.ptr())
       : "rax", "rbx", "rsp");
 }
 
diff --git a/test/syscalls/linux/fpsig_fork.cc b/test/syscalls/linux/fpsig_fork.cc
index e7e9f06a1..a346f1f00 100644
--- a/test/syscalls/linux/fpsig_fork.cc
+++ b/test/syscalls/linux/fpsig_fork.cc
@@ -76,8 +76,8 @@ TEST(FPSigTest, Fork) {
       "movl %[sig], %%edx;"
       "syscall;"
       :
-      : [killnr] "i"(__NR_tgkill), [parent] "rm"(parent),
-        [tid] "rm"(parent_tid), [sig] "i"(SIGUSR1)
+      : [ killnr ] "i"(__NR_tgkill), [ parent ] "rm"(parent),
+        [ tid ] "rm"(parent_tid), [ sig ] "i"(SIGUSR1)
       : "rax", "rdi", "rsi", "rdx",
         // Clobbered by syscall.
         "rcx", "r11");
diff --git a/test/syscalls/linux/fpsig_nested.cc b/test/syscalls/linux/fpsig_nested.cc
index 395463aed..c476a8e7a 100644
--- a/test/syscalls/linux/fpsig_nested.cc
+++ b/test/syscalls/linux/fpsig_nested.cc
@@ -61,8 +61,8 @@ void sigusr1(int s, siginfo_t* siginfo, void* _uc) {
       "movl %[sig], %%edx;"
       "syscall;"
       :
-      : [killnr] "i"(__NR_tgkill), [pid] "rm"(pid), [tid] "rm"(tid),
-        [sig] "i"(SIGUSR2)
+      : [ killnr ] "i"(__NR_tgkill), [ pid ] "rm"(pid), [ tid ] "rm"(tid),
+        [ sig ] "i"(SIGUSR2)
       : "rax", "rdi", "rsi", "rdx",
         // Clobbered by syscall.
         "rcx", "r11");
@@ -107,8 +107,8 @@ TEST(FPSigTest, NestedSignals) {
       "movl %[sig], %%edx;"
       "syscall;"
       :
-      : [killnr] "i"(__NR_tgkill), [pid] "rm"(pid), [tid] "rm"(tid),
-        [sig] "i"(SIGUSR1)
+      : [ killnr ] "i"(__NR_tgkill), [ pid ] "rm"(pid), [ tid ] "rm"(tid),
+        [ sig ] "i"(SIGUSR1)
       : "rax", "rdi", "rsi", "rdx",
         // Clobbered by syscall.
         "rcx", "r11");
diff --git a/test/syscalls/linux/madvise.cc b/test/syscalls/linux/madvise.cc
index 7fd0ea20c..dbd54ff2a 100644
--- a/test/syscalls/linux/madvise.cc
+++ b/test/syscalls/linux/madvise.cc
@@ -38,7 +38,7 @@ namespace testing {
 
 namespace {
 
-void ExpectAllMappingBytes(Mapping const& m, char c) {
+void ExpectAllMappingBytes(Mapping const &m, char c) {
   auto const v = m.view();
   for (size_t i = 0; i < kPageSize; i++) {
     ASSERT_EQ(v[i], c) << "at offset " << i;
@@ -47,7 +47,7 @@ void ExpectAllMappingBytes(Mapping const& m, char c) {
 
 // Equivalent to ExpectAllMappingBytes but async-signal-safe and with less
 // helpful failure messages.
-void CheckAllMappingBytes(Mapping const& m, char c) {
+void CheckAllMappingBytes(Mapping const &m, char c) {
   auto const v = m.view();
   for (size_t i = 0; i < kPageSize; i++) {
     TEST_CHECK_MSG(v[i] == c, "mapping contains wrong value");
diff --git a/test/syscalls/linux/mempolicy.cc b/test/syscalls/linux/mempolicy.cc
index 9d5f47651..d21093899 100644
--- a/test/syscalls/linux/mempolicy.cc
+++ b/test/syscalls/linux/mempolicy.cc
@@ -213,7 +213,7 @@ TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) {
     }
   }
 
-  void* invalid_address = reinterpret_cast<void*>(-1);
+  void *invalid_address = reinterpret_cast<void *>(-1);
 
   // Invalid address.
   ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, invalid_address,
@@ -221,8 +221,8 @@ TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) {
               SyscallFailsWithErrno(EFAULT));
 
   // Invalid mode pointer.
-  ASSERT_THAT(get_mempolicy(reinterpret_cast<int*>(invalid_address), nullptr, 0,
-                            &dummy_stack_address, MPOL_F_ADDR | MPOL_F_NODE),
+  ASSERT_THAT(get_mempolicy(reinterpret_cast<int *>(invalid_address), nullptr,
+                            0, &dummy_stack_address, MPOL_F_ADDR | MPOL_F_NODE),
               SyscallFailsWithErrno(EFAULT));
 }
 
diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc
index 620b4f8b4..367a90fe1 100644
--- a/test/syscalls/linux/mlock.cc
+++ b/test/syscalls/linux/mlock.cc
@@ -60,7 +60,6 @@ bool IsPageMlocked(uintptr_t addr) {
   return true;
 }
 
-
 TEST(MlockTest, Basic) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
   auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
diff --git a/test/syscalls/linux/msync.cc b/test/syscalls/linux/msync.cc
index ac7146017..2b2b6aef9 100644
--- a/test/syscalls/linux/msync.cc
+++ b/test/syscalls/linux/msync.cc
@@ -60,9 +60,7 @@ std::vector<std::function<PosixErrorOr<Mapping>()>> SyncableMappings() {
     for (int const mflags : {MAP_PRIVATE, MAP_SHARED}) {
       int const prot = PROT_READ | (writable ? PROT_WRITE : 0);
       int const oflags = O_CREAT | (writable ? O_RDWR : O_RDONLY);
-      funcs.push_back([=] {
-        return MmapAnon(kPageSize, prot, mflags);
-      });
+      funcs.push_back([=] { return MmapAnon(kPageSize, prot, mflags); });
       funcs.push_back([=]() -> PosixErrorOr<Mapping> {
         std::string const path = NewTempAbsPath();
         ASSIGN_OR_RETURN_ERRNO(auto fd, Open(path, oflags, 0644));
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index 8f3800380..ef67b747b 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -178,7 +178,8 @@ TEST(PtraceTest, GetSigMask) {
 
     // Install a signal handler for kBlockSignal to avoid termination and block
     // it.
-    TEST_PCHECK(signal(kBlockSignal, +[](int signo) {}) != SIG_ERR);
+    TEST_PCHECK(signal(
+                    kBlockSignal, +[](int signo) {}) != SIG_ERR);
     MaybeSave();
     TEST_PCHECK(sigprocmask(SIG_SETMASK, &blocked, nullptr) == 0);
     MaybeSave();
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 7e41fe7d8..294ee6808 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -113,7 +113,8 @@ TEST(SeccompTest, RetKillCausesDeathBySIGSYS) {
   pid_t const pid = fork();
   if (pid == 0) {
     // Register a signal handler for SIGSYS that we don't expect to be invoked.
-    RegisterSignalHandler(SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
+    RegisterSignalHandler(
+        SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
     ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
     syscall(kFilteredSyscall);
     TEST_CHECK_MSG(false, "Survived invocation of test syscall");
@@ -132,7 +133,8 @@ TEST(SeccompTest, RetKillOnlyKillsOneThread) {
   pid_t const pid = fork();
   if (pid == 0) {
     // Register a signal handler for SIGSYS that we don't expect to be invoked.
-    RegisterSignalHandler(SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
+    RegisterSignalHandler(
+        SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
     ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
     // Pass CLONE_VFORK to block the original thread in the child process until
     // the clone thread exits with SIGSYS.
@@ -346,7 +348,8 @@ TEST(SeccompTest, LeastPermissiveFilterReturnValueApplies) {
   // one that causes the kill that should be ignored.
   pid_t const pid = fork();
   if (pid == 0) {
-    RegisterSignalHandler(SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
+    RegisterSignalHandler(
+        SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
     ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRACE);
     ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
     ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM);
diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc
index 62b04ef1d..24e7c4960 100644
--- a/test/syscalls/linux/sigaltstack.cc
+++ b/test/syscalls/linux/sigaltstack.cc
@@ -168,8 +168,8 @@ TEST(SigaltstackTest, WalksOffBottom) {
 
   // Trigger a single fault.
   badhandler_low_water_mark =
-      static_cast<char*>(stack.ss_sp) + SIGSTKSZ;        // Expected top.
-  badhandler_recursive_faults = 0;                       // Disable refault.
+      static_cast<char*>(stack.ss_sp) + SIGSTKSZ;  // Expected top.
+  badhandler_recursive_faults = 0;                 // Disable refault.
   Fault();
   EXPECT_TRUE(badhandler_on_sigaltstack);
   EXPECT_THAT(sigaltstack(nullptr, &stack), SyscallSucceeds());
diff --git a/test/syscalls/linux/sigiret.cc b/test/syscalls/linux/sigiret.cc
index a47c781ea..4deb1ae95 100644
--- a/test/syscalls/linux/sigiret.cc
+++ b/test/syscalls/linux/sigiret.cc
@@ -78,8 +78,8 @@ TEST(SigIretTest, CheckRcxR11) {
       "1: pause; cmpl $0, %[gotvtalrm]; je 1b;"  // while (!gotvtalrm);
       "movq %%rcx, %[rcx];"                      // rcx = %rcx
       "movq %%r11, %[r11];"                      // r11 = %r11
-      : [ready] "=m"(ready), [rcx] "+m"(rcx), [r11] "+m"(r11)
-      : [gotvtalrm] "m"(gotvtalrm)
+      : [ ready ] "=m"(ready), [ rcx ] "+m"(rcx), [ r11 ] "+m"(r11)
+      : [ gotvtalrm ] "m"(gotvtalrm)
       : "cc", "memory", "rcx", "r11");
 
   // If sigreturn(2) returns via 'sysret' then %rcx and %r11 will be
diff --git a/test/syscalls/linux/socket_stream_blocking.cc b/test/syscalls/linux/socket_stream_blocking.cc
index e9cc082bf..538ee2268 100644
--- a/test/syscalls/linux/socket_stream_blocking.cc
+++ b/test/syscalls/linux/socket_stream_blocking.cc
@@ -32,38 +32,38 @@ namespace gvisor {
 namespace testing {
 
 TEST_P(BlockingStreamSocketPairTest, BlockPartialWriteClosed) {
-    // FIXME(b/35921550): gVisor doesn't support SO_SNDBUF on UDS, nor does it
-    // enforce any limit; it will write arbitrary amounts of data without
-    // blocking.
-    SKIP_IF(IsRunningOnGvisor());
-
-    auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-    int buffer_size;
-    socklen_t length = sizeof(buffer_size);
-    ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF,
-                           &buffer_size, &length),
-                SyscallSucceeds());
-
-    int wfd = sockets->first_fd();
-    ScopedThread t([wfd, buffer_size]() {
-      std::vector<char> buf(2 * buffer_size);
-      // Write more than fits in the buffer. Blocks then returns partial write
-      // when the other end is closed. The next call returns EPIPE.
-      //
-      // N.B. writes occur in chunks, so we may see less than buffer_size from
-      // the first call.
-      ASSERT_THAT(write(wfd, buf.data(), buf.size()),
-                  SyscallSucceedsWithValue(::testing::Gt(0)));
-      ASSERT_THAT(write(wfd, buf.data(), buf.size()),
-                  ::testing::AnyOf(SyscallFailsWithErrno(EPIPE),
-                                   SyscallFailsWithErrno(ECONNRESET)));
-    });
-
-    // Leave time for write to become blocked.
-    absl::SleepFor(absl::Seconds(1));
-
-    ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+  // FIXME(b/35921550): gVisor doesn't support SO_SNDBUF on UDS, nor does it
+  // enforce any limit; it will write arbitrary amounts of data without
+  // blocking.
+  SKIP_IF(IsRunningOnGvisor());
+
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int buffer_size;
+  socklen_t length = sizeof(buffer_size);
+  ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF,
+                         &buffer_size, &length),
+              SyscallSucceeds());
+
+  int wfd = sockets->first_fd();
+  ScopedThread t([wfd, buffer_size]() {
+    std::vector<char> buf(2 * buffer_size);
+    // Write more than fits in the buffer. Blocks then returns partial write
+    // when the other end is closed. The next call returns EPIPE.
+    //
+    // N.B. writes occur in chunks, so we may see less than buffer_size from
+    // the first call.
+    ASSERT_THAT(write(wfd, buf.data(), buf.size()),
+                SyscallSucceedsWithValue(::testing::Gt(0)));
+    ASSERT_THAT(write(wfd, buf.data(), buf.size()),
+                ::testing::AnyOf(SyscallFailsWithErrno(EPIPE),
+                                 SyscallFailsWithErrno(ECONNRESET)));
+  });
+
+  // Leave time for write to become blocked.
+  absl::SleepFor(absl::Seconds(1));
+
+  ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
 }
 
 // Random save may interrupt the call to sendmsg() in SendLargeSendMsg(),
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index 30de2f8ff..c1e45e10a 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -377,7 +377,7 @@ TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoRandomSave) {
   //
   // We need to support this because when a file is unlinked and we forward
   // the stat to the gofer it would return ENOENT.
-  const char* uncached_gofer = getenv("GVISOR_GOFER_UNCACHED");
+  const char *uncached_gofer = getenv("GVISOR_GOFER_UNCACHED");
   SKIP_IF(uncached_gofer != nullptr);
 
   // We don't support saving unlinked files.
diff --git a/test/syscalls/linux/udp_socket_errqueue_test_case.cc b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
index 9a24e1df0..fcdba7279 100644
--- a/test/syscalls/linux/udp_socket_errqueue_test_case.cc
+++ b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
@@ -14,8 +14,6 @@
 
 #ifndef __fuchsia__
 
-#include "test/syscalls/linux/udp_socket_test_cases.h"
-
 #include <arpa/inet.h>
 #include <fcntl.h>
 #include <linux/errqueue.h>
@@ -29,6 +27,7 @@
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/udp_socket_test_cases.h"
 #include "test/syscalls/linux/unix_domain_socket_test_util.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
diff --git a/test/util/capability_util.cc b/test/util/capability_util.cc
index 5d733887b..9fee52fbb 100644
--- a/test/util/capability_util.cc
+++ b/test/util/capability_util.cc
@@ -36,10 +36,10 @@ PosixErrorOr<bool> CanCreateUserNamespace() {
   ASSIGN_OR_RETURN_ERRNO(
       auto child_stack,
       MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
-  int const child_pid =
-      clone(+[](void*) { return 0; },
-            reinterpret_cast<void*>(child_stack.addr() + kPageSize),
-            CLONE_NEWUSER | SIGCHLD, /* arg = */ nullptr);
+  int const child_pid = clone(
+      +[](void*) { return 0; },
+      reinterpret_cast<void*>(child_stack.addr() + kPageSize),
+      CLONE_NEWUSER | SIGCHLD, /* arg = */ nullptr);
   if (child_pid > 0) {
     int status;
     int const ret = waitpid(child_pid, &status, /* options = */ 0);
diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc
index 042cec94a..052781445 100644
--- a/test/util/fs_util.cc
+++ b/test/util/fs_util.cc
@@ -452,7 +452,7 @@ PosixErrorOr<std::string> MakeAbsolute(absl::string_view filename,
 
 std::string CleanPath(const absl::string_view unclean_path) {
   std::string path = std::string(unclean_path);
-  const char *src = path.c_str();
+  const char* src = path.c_str();
   std::string::iterator dst = path.begin();
 
   // Check for absolute path and determine initial backtrack limit.
diff --git a/test/util/multiprocess_util.h b/test/util/multiprocess_util.h
index 3e736261b..2f3bf4a6f 100644
--- a/test/util/multiprocess_util.h
+++ b/test/util/multiprocess_util.h
@@ -99,7 +99,8 @@ inline PosixErrorOr<Cleanup> ForkAndExec(const std::string& filename,
                                          const ExecveArray& argv,
                                          const ExecveArray& envv, pid_t* child,
                                          int* execve_errno) {
-  return ForkAndExec(filename, argv, envv, [] {}, child, execve_errno);
+  return ForkAndExec(
+      filename, argv, envv, [] {}, child, execve_errno);
 }
 
 // Equivalent to ForkAndExec, except using dirfd and flags with execveat.
-- 
cgit v1.2.3


From 5d569408ef94c753b7aae9392b5e4ebf7e5ea50d Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Mon, 27 Jan 2020 22:27:57 -0800
Subject: Create platform_util for tests.

PiperOrigin-RevId: 291869423
---
 test/syscalls/linux/32bit.cc       | 130 +++++++++++++++++++++----------------
 test/syscalls/linux/BUILD          |   5 ++
 test/syscalls/linux/arch_prctl.cc  |   2 -
 test/syscalls/linux/concurrency.cc |   2 +
 test/syscalls/linux/exceptions.cc  |   2 +
 test/syscalls/linux/ptrace.cc      |  10 +--
 test/util/BUILD                    |  15 +++--
 test/util/platform_util.cc         |  49 ++++++++++++++
 test/util/platform_util.h          |  56 ++++++++++++++++
 test/util/test_util.cc             |  11 +---
 test/util/test_util.h              |  27 ++++----
 11 files changed, 216 insertions(+), 93 deletions(-)
 create mode 100644 test/util/platform_util.cc
 create mode 100644 test/util/platform_util.h

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index 6a15d47e1..2751fb4e7 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -15,10 +15,12 @@
 #include <string.h>
 #include <sys/mman.h>
 
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
 #include "test/util/memory_util.h"
+#include "test/util/platform_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/test_util.h"
-#include "gtest/gtest.h"
 
 #ifndef __x86_64__
 #error "This test is x86-64 specific."
@@ -30,7 +32,6 @@ namespace testing {
 namespace {
 
 constexpr char kInt3 = '\xcc';
-
 constexpr char kInt80[2] = {'\xcd', '\x80'};
 constexpr char kSyscall[2] = {'\x0f', '\x05'};
 constexpr char kSysenter[2] = {'\x0f', '\x34'};
@@ -43,6 +44,7 @@ void ExitGroup32(const char instruction[2], int code) {
   // Fill with INT 3 in case we execute too far.
   memset(m.ptr(), kInt3, m.len());
 
+  // Copy in the actual instruction.
   memcpy(m.ptr(), instruction, 2);
 
   // We're playing *extremely* fast-and-loose with the various syscall ABIs
@@ -78,70 +80,87 @@ void ExitGroup32(const char instruction[2], int code) {
 constexpr int kExitCode = 42;
 
 TEST(Syscall32Bit, Int80) {
-  switch (GvisorPlatform()) {
-    case Platform::kKVM:
-      // TODO(b/111805002): 32-bit segments are broken (but not explictly
-      // disabled).
-      return;
-    case Platform::kPtrace:
-      // TODO(gvisor.dev/issue/167): The ptrace platform does not have a
-      // consistent story here.
-      return;
-    case Platform::kNative:
+  switch (PlatformSupport32Bit()) {
+    case PlatformSupport::NotSupported:
+      break;
+    case PlatformSupport::Segfault:
+      EXPECT_EXIT(ExitGroup32(kInt80, kExitCode),
+                  ::testing::KilledBySignal(SIGSEGV), "");
       break;
-  }
 
-  // Upstream Linux. 32-bit syscalls allowed.
-  EXPECT_EXIT(ExitGroup32(kInt80, kExitCode), ::testing::ExitedWithCode(42),
-              "");
-}
+    case PlatformSupport::Ignored:
+      // Since the call is ignored, we'll hit the int3 trap.
+      EXPECT_EXIT(ExitGroup32(kInt80, kExitCode),
+                  ::testing::KilledBySignal(SIGTRAP), "");
+      break;
 
-TEST(Syscall32Bit, Sysenter) {
-  switch (GvisorPlatform()) {
-    case Platform::kKVM:
-      // TODO(b/111805002): See above.
-      return;
-    case Platform::kPtrace:
-      // TODO(gvisor.dev/issue/167): See above.
-      return;
-    case Platform::kNative:
+    case PlatformSupport::Allowed:
+      EXPECT_EXIT(ExitGroup32(kInt80, kExitCode), ::testing::ExitedWithCode(42),
+                  "");
       break;
   }
+}
 
-  if (GetCPUVendor() == CPUVendor::kAMD) {
+TEST(Syscall32Bit, Sysenter) {
+  if (PlatformSupport32Bit() == PlatformSupport::Allowed &&
+      GetCPUVendor() == CPUVendor::kAMD) {
     // SYSENTER is an illegal instruction in compatibility mode on AMD.
     EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
                 ::testing::KilledBySignal(SIGILL), "");
     return;
   }
 
-  // Upstream Linux on !AMD, 32-bit syscalls allowed.
-  EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode), ::testing::ExitedWithCode(42),
-              "");
-}
+  switch (PlatformSupport32Bit()) {
+    case PlatformSupport::NotSupported:
+      break;
 
-TEST(Syscall32Bit, Syscall) {
-  switch (GvisorPlatform()) {
-    case Platform::kKVM:
-      // TODO(b/111805002): See above.
-      return;
-    case Platform::kPtrace:
-      // TODO(gvisor.dev/issue/167): See above.
-      return;
-    case Platform::kNative:
+    case PlatformSupport::Segfault:
+      EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
+                  ::testing::KilledBySignal(SIGSEGV), "");
+      break;
+
+    case PlatformSupport::Ignored:
+      // See above, except expected code is SIGSEGV.
+      EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
+                  ::testing::KilledBySignal(SIGSEGV), "");
+      break;
+
+    case PlatformSupport::Allowed:
+      EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
+                  ::testing::ExitedWithCode(42), "");
       break;
   }
+}
 
-  if (GetCPUVendor() == CPUVendor::kIntel) {
+TEST(Syscall32Bit, Syscall) {
+  if (PlatformSupport32Bit() == PlatformSupport::Allowed &&
+      GetCPUVendor() == CPUVendor::kIntel) {
     // SYSCALL is an illegal instruction in compatibility mode on Intel.
     EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
                 ::testing::KilledBySignal(SIGILL), "");
     return;
   }
 
-  // Upstream Linux on !Intel, 32-bit syscalls allowed.
-  EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode), ::testing::ExitedWithCode(42),
-              "");
+  switch (PlatformSupport32Bit()) {
+    case PlatformSupport::NotSupported:
+      break;
+
+    case PlatformSupport::Segfault:
+      EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
+                  ::testing::KilledBySignal(SIGSEGV), "");
+      break;
+
+    case PlatformSupport::Ignored:
+      // See above.
+      EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
+                  ::testing::KilledBySignal(SIGILL), "");
+      break;
+
+    case PlatformSupport::Allowed:
+      EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
+                  ::testing::ExitedWithCode(42), "");
+      break;
+  }
 }
 
 // Far call code called below.
@@ -205,19 +224,20 @@ void FarCall32() {
 }
 
 TEST(Call32Bit, Disallowed) {
-  switch (GvisorPlatform()) {
-    case Platform::kKVM:
-      // TODO(b/111805002): See above.
-      return;
-    case Platform::kPtrace:
-      // The ptrace platform cannot prevent switching to compatibility mode.
-      ABSL_FALLTHROUGH_INTENDED;
-    case Platform::kNative:
+  switch (PlatformSupport32Bit()) {
+    case PlatformSupport::NotSupported:
       break;
-  }
 
-  // Shouldn't crash.
-  FarCall32();
+    case PlatformSupport::Segfault:
+      EXPECT_EXIT(FarCall32(), ::testing::KilledBySignal(SIGSEGV), "");
+      break;
+
+    case PlatformSupport::Ignored:
+      ABSL_FALLTHROUGH_INTENDED;
+    case PlatformSupport::Allowed:
+      // Shouldn't crash.
+      FarCall32();
+  }
 }
 
 }  // namespace
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index c2ef50c1d..74bf068ec 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -197,9 +197,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:memory_util",
+        "//test/util:platform_util",
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -479,6 +481,7 @@ cc_binary(
     srcs = ["concurrency.cc"],
     linkstatic = 1,
     deps = [
+        "//test/util:platform_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
@@ -584,6 +587,7 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:logging",
+        "//test/util:platform_util",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
@@ -1658,6 +1662,7 @@ cc_binary(
     deps = [
         "//test/util:logging",
         "//test/util:multiprocess_util",
+        "//test/util:platform_util",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
diff --git a/test/syscalls/linux/arch_prctl.cc b/test/syscalls/linux/arch_prctl.cc
index 3a901faf5..81bf5a775 100644
--- a/test/syscalls/linux/arch_prctl.cc
+++ b/test/syscalls/linux/arch_prctl.cc
@@ -14,10 +14,8 @@
 
 #include <asm/prctl.h>
 #include <sys/prctl.h>
-#include <sys/syscall.h>
 
 #include "gtest/gtest.h"
-#include "test/util/file_descriptor.h"
 #include "test/util/test_util.h"
 
 // glibc does not provide a prototype for arch_prctl() so declare it here.
diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc
index 00b96b34a..f41f99900 100644
--- a/test/syscalls/linux/concurrency.cc
+++ b/test/syscalls/linux/concurrency.cc
@@ -20,6 +20,7 @@
 #include "absl/strings/string_view.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
+#include "test/util/platform_util.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
@@ -99,6 +100,7 @@ TEST(ConcurrencyTest, MultiProcessMultithreaded) {
 // Test that multiple processes can execute concurrently, even if one process
 // never yields.
 TEST(ConcurrencyTest, MultiProcessConcurrency) {
+  SKIP_IF(PlatformSupportMultiProcess() == PlatformSupport::NotSupported);
 
   pid_t child_pid = fork();
   if (child_pid == 0) {
diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc
index 3d564e720..420b9543f 100644
--- a/test/syscalls/linux/exceptions.cc
+++ b/test/syscalls/linux/exceptions.cc
@@ -16,6 +16,7 @@
 
 #include "gtest/gtest.h"
 #include "test/util/logging.h"
+#include "test/util/platform_util.h"
 #include "test/util/signal_util.h"
 #include "test/util/test_util.h"
 
@@ -324,6 +325,7 @@ TEST(ExceptionTest, AlignmentHalt) {
 }
 
 TEST(ExceptionTest, AlignmentCheck) {
+  SKIP_IF(PlatformSupportAlignmentCheck() != PlatformSupport::Allowed);
 
   // See above.
   struct sigaction sa = {};
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index ef67b747b..4dd5cf27b 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -32,6 +32,7 @@
 #include "absl/time/time.h"
 #include "test/util/logging.h"
 #include "test/util/multiprocess_util.h"
+#include "test/util/platform_util.h"
 #include "test/util/signal_util.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
@@ -824,13 +825,8 @@ TEST(PtraceTest,
 // These tests requires knowledge of architecture-specific syscall convention.
 #ifdef __x86_64__
 TEST(PtraceTest, Int3) {
-  switch (GvisorPlatform()) {
-    case Platform::kKVM:
-      // TODO(b/124248694): int3 isn't handled properly.
-      return;
-    default:
-      break;
-  }
+  SKIP_IF(PlatformSupportInt3() == PlatformSupport::NotSupported);
+
   pid_t const child_pid = fork();
   if (child_pid == 0) {
     // In child process.
diff --git a/test/util/BUILD b/test/util/BUILD
index 3c732be62..1ac8b3fd6 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -165,6 +165,14 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "platform_util",
+    testonly = 1,
+    srcs = ["platform_util.cc"],
+    hdrs = ["platform_util.h"],
+    deps = [":test_util"],
+)
+
 cc_library(
     name = "posix_error",
     testonly = 1,
@@ -238,12 +246,7 @@ cc_library(
         "test_util_runfiles.cc",
     ],
     hdrs = ["test_util.h"],
-    defines = select_system(
-        fuchsia = [
-            "__opensource__",
-            "__fuchsia__",
-        ],
-    ),
+    defines = select_system(),
     deps = [
         ":fs_util",
         ":logging",
diff --git a/test/util/platform_util.cc b/test/util/platform_util.cc
new file mode 100644
index 000000000..2724e63f3
--- /dev/null
+++ b/test/util/platform_util.cc
@@ -0,0 +1,49 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/util/platform_util.h"
+
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+PlatformSupport PlatformSupport32Bit() {
+  if (GvisorPlatform() == Platform::kPtrace) {
+    return PlatformSupport::NotSupported;
+  } else if (GvisorPlatform() == Platform::kKVM) {
+    return PlatformSupport::Segfault;
+  } else {
+    return PlatformSupport::Allowed;
+  }
+}
+
+PlatformSupport PlatformSupportAlignmentCheck() {
+  return PlatformSupport::Allowed;
+}
+
+PlatformSupport PlatformSupportMultiProcess() {
+  return PlatformSupport::Allowed;
+}
+
+PlatformSupport PlatformSupportInt3() {
+  if (GvisorPlatform() == Platform::kKVM) {
+    return PlatformSupport::NotSupported;
+  } else {
+    return PlatformSupport::Allowed;
+  }
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/util/platform_util.h b/test/util/platform_util.h
new file mode 100644
index 000000000..28cc92371
--- /dev/null
+++ b/test/util/platform_util.h
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_UTIL_PLATFORM_UTIL_H_
+#define GVISOR_TEST_UTIL_PLATFORM_UTIL_H_
+
+namespace gvisor {
+namespace testing {
+
+// PlatformSupport is a generic enumeration of classes of support.
+//
+// It is up to the individual functions and callers to agree on the precise
+// definition for each case. The documentation here generally refers to 32-bit
+// as an example. Many cases will use only NotSupported and Allowed.
+enum class PlatformSupport {
+  // The feature is not supported on the current platform.
+  //
+  // In the case of 32-bit, this means that calls will generally be interpreted
+  // as 64-bit calls, and there is no support for 32-bit binaries, long calls,
+  // etc. This usually means that the underlying implementation just pretends
+  // that 32-bit doesn't exist.
+  NotSupported,
+
+  // Calls will be ignored by the kernel with a fixed error.
+  Ignored,
+
+  // Calls will result in a SIGSEGV or similar fault.
+  Segfault,
+
+  // The feature is supported as expected.
+  //
+  // In the case of 32-bit, this means that the system call or far call will be
+  // handled properly.
+  Allowed,
+};
+
+PlatformSupport PlatformSupport32Bit();
+PlatformSupport PlatformSupportAlignmentCheck();
+PlatformSupport PlatformSupportMultiProcess();
+PlatformSupport PlatformSupportInt3();
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_UTIL_PLATFORM_UTIL_H_
diff --git a/test/util/test_util.cc b/test/util/test_util.cc
index 848504c88..15cbc6da6 100644
--- a/test/util/test_util.cc
+++ b/test/util/test_util.cc
@@ -45,20 +45,13 @@ namespace testing {
 
 bool IsRunningOnGvisor() { return GvisorPlatform() != Platform::kNative; }
 
-Platform GvisorPlatform() {
+const std::string GvisorPlatform() {
   // Set by runner.go.
   char* env = getenv(TEST_ON_GVISOR);
   if (!env) {
     return Platform::kNative;
   }
-  if (strcmp(env, "ptrace") == 0) {
-    return Platform::kPtrace;
-  }
-  if (strcmp(env, "kvm") == 0) {
-    return Platform::kKVM;
-  }
-  std::cerr << "unknown platform " << env;
-  abort();
+  return std::string(env);
 }
 
 bool IsRunningWithHostinet() {
diff --git a/test/util/test_util.h b/test/util/test_util.h
index b3235c7e3..2d22b0eb8 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -26,16 +26,13 @@
 // IsRunningOnGvisor returns true if the test is known to be running on gVisor.
 // GvisorPlatform can be used to get more detail:
 //
-//   switch (GvisorPlatform()) {
-//     case Platform::kNative:
-//     case Platform::kGvisor:
-//       EXPECT_THAT(mmap(...), SyscallSucceeds());
-//       break;
-//     case Platform::kPtrace:
-//       EXPECT_THAT(mmap(...), SyscallFailsWithErrno(ENOSYS));
-//       break;
+//   if (GvisorPlatform() == Platform::kPtrace) {
+//       ...
 //   }
 //
+// SetupGvisorDeathTest ensures that signal handling does not interfere with
+// tests that rely on fatal signals.
+//
 // Matchers
 // ========
 //
@@ -213,13 +210,15 @@ void TestInit(int* argc, char*** argv);
     if (expr) GTEST_SKIP() << #expr; \
   } while (0)
 
-enum class Platform {
-  kNative,
-  kKVM,
-  kPtrace,
-};
+// Platform contains platform names.
+namespace Platform {
+constexpr char kNative[] = "native";
+constexpr char kPtrace[] = "ptrace";
+constexpr char kKVM[] = "kvm";
+}  // namespace Platform
+
 bool IsRunningOnGvisor();
-Platform GvisorPlatform();
+const std::string GvisorPlatform();
 bool IsRunningWithHostinet();
 
 #ifdef __linux__
-- 
cgit v1.2.3


From 74e04506a430535b7f3461eb35f36c9398db735a Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 28 Jan 2020 11:06:24 -0800
Subject: Prefer Type& over Type &

And Type* over Type *. This is basically a whitespace only change.

gVisor code already prefers left-alignment of pointers and references, but
clang-format formats for consistency with the majority of a file, and some
files leaned the wrong way. This is a one-time pass to make us completely
conforming.

Autogenerated with:

$ find . \( -name "*.cc" -or -name "*.c" -or -name "*.h" \) \
    | xargs clang-format -i -style="{BasedOnStyle: Google,  \
        DerivePointerAlignment: false, PointerAlignment: Left}"

PiperOrigin-RevId: 291972421
---
 test/syscalls/linux/connect_external.cc              | 12 ++++++------
 test/syscalls/linux/getrusage.cc                     |  2 +-
 test/syscalls/linux/iptables.h                       |  2 +-
 test/syscalls/linux/madvise.cc                       | 16 ++++++++--------
 test/syscalls/linux/mempolicy.cc                     | 16 ++++++++--------
 test/syscalls/linux/proc_net.cc                      | 20 ++++++++++----------
 test/syscalls/linux/sendfile_socket.cc               |  8 ++++----
 .../syscalls/linux/socket_bind_to_device_sequence.cc | 10 +++++-----
 test/syscalls/linux/socket_netdevice.cc              | 10 +++++-----
 test/syscalls/linux/stat.cc                          |  6 +++---
 test/util/mount_util.h                               |  8 ++++----
 vdso/syscalls.h                                      |  8 ++++----
 12 files changed, 59 insertions(+), 59 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/connect_external.cc b/test/syscalls/linux/connect_external.cc
index bfe1da82e..1edb50e47 100644
--- a/test/syscalls/linux/connect_external.cc
+++ b/test/syscalls/linux/connect_external.cc
@@ -56,7 +56,7 @@ TEST_P(GoferStreamSeqpacketTest, Echo) {
   ProtocolSocket proto;
   std::tie(env, proto) = GetParam();
 
-  char *val = getenv(env.c_str());
+  char* val = getenv(env.c_str());
   ASSERT_NE(val, nullptr);
   std::string root(val);
 
@@ -69,7 +69,7 @@ TEST_P(GoferStreamSeqpacketTest, Echo) {
   addr.sun_family = AF_UNIX;
   memcpy(addr.sun_path, socket_path.c_str(), socket_path.length());
 
-  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr *>(&addr),
+  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr*>(&addr),
                       sizeof(addr)),
               SyscallSucceeds());
 
@@ -92,7 +92,7 @@ TEST_P(GoferStreamSeqpacketTest, NonListening) {
   ProtocolSocket proto;
   std::tie(env, proto) = GetParam();
 
-  char *val = getenv(env.c_str());
+  char* val = getenv(env.c_str());
   ASSERT_NE(val, nullptr);
   std::string root(val);
 
@@ -105,7 +105,7 @@ TEST_P(GoferStreamSeqpacketTest, NonListening) {
   addr.sun_family = AF_UNIX;
   memcpy(addr.sun_path, socket_path.c_str(), socket_path.length());
 
-  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr *>(&addr),
+  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr*>(&addr),
                       sizeof(addr)),
               SyscallFailsWithErrno(ECONNREFUSED));
 }
@@ -127,7 +127,7 @@ using GoferDgramTest = ::testing::TestWithParam<std::string>;
 // unnamed. The server thus has no way to reply to us.
 TEST_P(GoferDgramTest, Null) {
   std::string env = GetParam();
-  char *val = getenv(env.c_str());
+  char* val = getenv(env.c_str());
   ASSERT_NE(val, nullptr);
   std::string root(val);
 
@@ -140,7 +140,7 @@ TEST_P(GoferDgramTest, Null) {
   addr.sun_family = AF_UNIX;
   memcpy(addr.sun_path, socket_path.c_str(), socket_path.length());
 
-  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr *>(&addr),
+  ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr*>(&addr),
                       sizeof(addr)),
               SyscallSucceeds());
 
diff --git a/test/syscalls/linux/getrusage.cc b/test/syscalls/linux/getrusage.cc
index 9bdb1e4cd..0e51d42a8 100644
--- a/test/syscalls/linux/getrusage.cc
+++ b/test/syscalls/linux/getrusage.cc
@@ -67,7 +67,7 @@ TEST(GetrusageTest, Grandchild) {
     pid = fork();
     if (pid == 0) {
       int flags = MAP_ANONYMOUS | MAP_POPULATE | MAP_PRIVATE;
-      void *addr =
+      void* addr =
           mmap(nullptr, kGrandchildSizeKb * 1024, PROT_WRITE, flags, -1, 0);
       TEST_PCHECK(addr != MAP_FAILED);
     } else {
diff --git a/test/syscalls/linux/iptables.h b/test/syscalls/linux/iptables.h
index 616bea550..0719c60a4 100644
--- a/test/syscalls/linux/iptables.h
+++ b/test/syscalls/linux/iptables.h
@@ -188,7 +188,7 @@ struct ipt_replace {
   unsigned int num_counters;
 
   // The unchanged values from each ipt_entry's counters.
-  struct xt_counters *counters;
+  struct xt_counters* counters;
 
   // The entries to write to the table. This will run past the size defined by
   // sizeof(srtuct ipt_replace);
diff --git a/test/syscalls/linux/madvise.cc b/test/syscalls/linux/madvise.cc
index dbd54ff2a..5a1973f60 100644
--- a/test/syscalls/linux/madvise.cc
+++ b/test/syscalls/linux/madvise.cc
@@ -38,7 +38,7 @@ namespace testing {
 
 namespace {
 
-void ExpectAllMappingBytes(Mapping const &m, char c) {
+void ExpectAllMappingBytes(Mapping const& m, char c) {
   auto const v = m.view();
   for (size_t i = 0; i < kPageSize; i++) {
     ASSERT_EQ(v[i], c) << "at offset " << i;
@@ -47,7 +47,7 @@ void ExpectAllMappingBytes(Mapping const &m, char c) {
 
 // Equivalent to ExpectAllMappingBytes but async-signal-safe and with less
 // helpful failure messages.
-void CheckAllMappingBytes(Mapping const &m, char c) {
+void CheckAllMappingBytes(Mapping const& m, char c) {
   auto const v = m.view();
   for (size_t i = 0; i < kPageSize; i++) {
     TEST_CHECK_MSG(v[i] == c, "mapping contains wrong value");
@@ -139,7 +139,7 @@ TEST(MadviseDontneedTest, IgnoresPermissions) {
 TEST(MadviseDontforkTest, AddressLength) {
   auto m =
       ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE));
-  char *addr = static_cast<char *>(m.ptr());
+  char* addr = static_cast<char*>(m.ptr());
 
   // Address must be page aligned.
   EXPECT_THAT(madvise(addr + 1, kPageSize, MADV_DONTFORK),
@@ -168,9 +168,9 @@ TEST(MadviseDontforkTest, DontforkShared) {
   Mapping m = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
       nullptr, kPageSize * 2, PROT_READ | PROT_WRITE, MAP_SHARED, fd.get(), 0));
 
-  const Mapping ms1 = Mapping(reinterpret_cast<void *>(m.addr()), kPageSize);
+  const Mapping ms1 = Mapping(reinterpret_cast<void*>(m.addr()), kPageSize);
   const Mapping ms2 =
-      Mapping(reinterpret_cast<void *>(m.addr() + kPageSize), kPageSize);
+      Mapping(reinterpret_cast<void*>(m.addr() + kPageSize), kPageSize);
   m.release();
 
   ASSERT_THAT(madvise(ms2.ptr(), kPageSize, MADV_DONTFORK), SyscallSucceeds());
@@ -197,11 +197,11 @@ TEST(MadviseDontforkTest, DontforkAnonPrivate) {
   // Mmap three anonymous pages and MADV_DONTFORK the middle page.
   Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
       MmapAnon(kPageSize * 3, PROT_READ | PROT_WRITE, MAP_PRIVATE));
-  const Mapping mp1 = Mapping(reinterpret_cast<void *>(m.addr()), kPageSize);
+  const Mapping mp1 = Mapping(reinterpret_cast<void*>(m.addr()), kPageSize);
   const Mapping mp2 =
-      Mapping(reinterpret_cast<void *>(m.addr() + kPageSize), kPageSize);
+      Mapping(reinterpret_cast<void*>(m.addr() + kPageSize), kPageSize);
   const Mapping mp3 =
-      Mapping(reinterpret_cast<void *>(m.addr() + 2 * kPageSize), kPageSize);
+      Mapping(reinterpret_cast<void*>(m.addr() + 2 * kPageSize), kPageSize);
   m.release();
 
   ASSERT_THAT(madvise(mp2.ptr(), kPageSize, MADV_DONTFORK), SyscallSucceeds());
diff --git a/test/syscalls/linux/mempolicy.cc b/test/syscalls/linux/mempolicy.cc
index d21093899..059fad598 100644
--- a/test/syscalls/linux/mempolicy.cc
+++ b/test/syscalls/linux/mempolicy.cc
@@ -43,17 +43,17 @@ namespace {
 #define MPOL_MF_MOVE (1 << 1)
 #define MPOL_MF_MOVE_ALL (1 << 2)
 
-int get_mempolicy(int *policy, uint64_t *nmask, uint64_t maxnode, void *addr,
+int get_mempolicy(int* policy, uint64_t* nmask, uint64_t maxnode, void* addr,
                   int flags) {
   return syscall(SYS_get_mempolicy, policy, nmask, maxnode, addr, flags);
 }
 
-int set_mempolicy(int mode, uint64_t *nmask, uint64_t maxnode) {
+int set_mempolicy(int mode, uint64_t* nmask, uint64_t maxnode) {
   return syscall(SYS_set_mempolicy, mode, nmask, maxnode);
 }
 
-int mbind(void *addr, unsigned long len, int mode,
-          const unsigned long *nodemask, unsigned long maxnode,
+int mbind(void* addr, unsigned long len, int mode,
+          const unsigned long* nodemask, unsigned long maxnode,
           unsigned flags) {
   return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
 }
@@ -68,7 +68,7 @@ Cleanup ScopedMempolicy() {
 
 // Temporarily change the memory policy for the calling thread within the
 // caller's scope.
-PosixErrorOr<Cleanup> ScopedSetMempolicy(int mode, uint64_t *nmask,
+PosixErrorOr<Cleanup> ScopedSetMempolicy(int mode, uint64_t* nmask,
                                          uint64_t maxnode) {
   if (set_mempolicy(mode, nmask, maxnode)) {
     return PosixError(errno, "set_mempolicy");
@@ -213,7 +213,7 @@ TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) {
     }
   }
 
-  void *invalid_address = reinterpret_cast<void *>(-1);
+  void* invalid_address = reinterpret_cast<void*>(-1);
 
   // Invalid address.
   ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, invalid_address,
@@ -221,8 +221,8 @@ TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) {
               SyscallFailsWithErrno(EFAULT));
 
   // Invalid mode pointer.
-  ASSERT_THAT(get_mempolicy(reinterpret_cast<int *>(invalid_address), nullptr,
-                            0, &dummy_stack_address, MPOL_F_ADDR | MPOL_F_NODE),
+  ASSERT_THAT(get_mempolicy(reinterpret_cast<int*>(invalid_address), nullptr, 0,
+                            &dummy_stack_address, MPOL_F_ADDR | MPOL_F_NODE),
               SyscallFailsWithErrno(EFAULT));
 }
 
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index 65bad06d4..3a611a86f 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -68,8 +68,8 @@ TEST(ProcSysNetIpv4Sack, CanReadAndWrite) {
 }
 
 PosixErrorOr<uint64_t> GetSNMPMetricFromProc(const std::string snmp,
-                                             const std::string &type,
-                                             const std::string &item) {
+                                             const std::string& type,
+                                             const std::string& item) {
   std::vector<std::string> snmp_vec = absl::StrSplit(snmp, '\n');
 
   // /proc/net/snmp prints a line of headers followed by a line of metrics.
@@ -127,7 +127,7 @@ TEST(ProcNetSnmp, TcpReset_NoRandomSave) {
   };
 
   ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
-  ASSERT_THAT(connect(s.get(), (struct sockaddr *)&sin, sizeof(sin)),
+  ASSERT_THAT(connect(s.get(), (struct sockaddr*)&sin, sizeof(sin)),
               SyscallFailsWithErrno(ECONNREFUSED));
 
   uint64_t newAttemptFails;
@@ -172,19 +172,19 @@ TEST(ProcNetSnmp, TcpEstab_NoRandomSave) {
   };
 
   ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
-  ASSERT_THAT(bind(s_listen.get(), (struct sockaddr *)&sin, sizeof(sin)),
+  ASSERT_THAT(bind(s_listen.get(), (struct sockaddr*)&sin, sizeof(sin)),
               SyscallSucceeds());
   ASSERT_THAT(listen(s_listen.get(), 1), SyscallSucceeds());
 
   // Get the port bound by the listening socket.
   socklen_t addrlen = sizeof(sin);
   ASSERT_THAT(
-      getsockname(s_listen.get(), reinterpret_cast<sockaddr *>(&sin), &addrlen),
+      getsockname(s_listen.get(), reinterpret_cast<sockaddr*>(&sin), &addrlen),
       SyscallSucceeds());
 
   FileDescriptor s_connect =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
-  ASSERT_THAT(connect(s_connect.get(), (struct sockaddr *)&sin, sizeof(sin)),
+  ASSERT_THAT(connect(s_connect.get(), (struct sockaddr*)&sin, sizeof(sin)),
               SyscallSucceeds());
 
   auto s_accept =
@@ -260,7 +260,7 @@ TEST(ProcNetSnmp, UdpNoPorts_NoRandomSave) {
       .sin_port = htons(4444),
   };
   ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
-  ASSERT_THAT(sendto(s.get(), "a", 1, 0, (struct sockaddr *)&sin, sizeof(sin)),
+  ASSERT_THAT(sendto(s.get(), "a", 1, 0, (struct sockaddr*)&sin, sizeof(sin)),
               SyscallSucceedsWithValue(1));
 
   uint64_t newOutDatagrams;
@@ -295,18 +295,18 @@ TEST(ProcNetSnmp, UdpIn) {
       .sin_port = htons(0),
   };
   ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
-  ASSERT_THAT(bind(server.get(), (struct sockaddr *)&sin, sizeof(sin)),
+  ASSERT_THAT(bind(server.get(), (struct sockaddr*)&sin, sizeof(sin)),
               SyscallSucceeds());
   // Get the port bound by the server socket.
   socklen_t addrlen = sizeof(sin);
   ASSERT_THAT(
-      getsockname(server.get(), reinterpret_cast<sockaddr *>(&sin), &addrlen),
+      getsockname(server.get(), reinterpret_cast<sockaddr*>(&sin), &addrlen),
       SyscallSucceeds());
 
   FileDescriptor client =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
   ASSERT_THAT(
-      sendto(client.get(), "a", 1, 0, (struct sockaddr *)&sin, sizeof(sin)),
+      sendto(client.get(), "a", 1, 0, (struct sockaddr*)&sin, sizeof(sin)),
       SyscallSucceedsWithValue(1));
 
   char buf[128];
diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc
index 3331288b7..8f7ee4163 100644
--- a/test/syscalls/linux/sendfile_socket.cc
+++ b/test/syscalls/linux/sendfile_socket.cc
@@ -41,15 +41,15 @@ class SendFileTest : public ::testing::TestWithParam<int> {
     struct sockaddr server_addr = {};
     switch (family) {
       case AF_INET: {
-        struct sockaddr_in *server_addr_in =
-            reinterpret_cast<struct sockaddr_in *>(&server_addr);
+        struct sockaddr_in* server_addr_in =
+            reinterpret_cast<struct sockaddr_in*>(&server_addr);
         server_addr_in->sin_family = family;
         server_addr_in->sin_addr.s_addr = INADDR_ANY;
         break;
       }
       case AF_UNIX: {
-        struct sockaddr_un *server_addr_un =
-            reinterpret_cast<struct sockaddr_un *>(&server_addr);
+        struct sockaddr_un* server_addr_un =
+            reinterpret_cast<struct sockaddr_un*>(&server_addr);
         server_addr_un->sun_family = family;
         server_addr_un->sun_path[0] = '\0';
         break;
diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
index 34b1058a9..637d1151a 100644
--- a/test/syscalls/linux/socket_bind_to_device_sequence.cc
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -66,7 +66,7 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
   // Gets a device by device_id.  If the device_id has been seen before, returns
   // the previously returned device.  If not, finds or creates a new device.
   // Returns an empty string on failure.
-  void GetDevice(int device_id, string *device_name) {
+  void GetDevice(int device_id, string* device_name) {
     auto device = devices_.find(device_id);
     if (device != devices_.end()) {
       *device_name = device->second;
@@ -112,7 +112,7 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
   // Sets the socket_id to uniquely identify the socket bound if it is not
   // nullptr.
   void BindSocket(bool reuse_port, bool reuse_addr, int device_id = 0,
-                  int want = 0, int *socket_id = nullptr) {
+                  int want = 0, int* socket_id = nullptr) {
     next_socket_id_++;
     sockets_to_close_[next_socket_id_] = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
     auto socket_fd = sockets_to_close_[next_socket_id_]->get();
@@ -154,12 +154,12 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
     addr.sin_port = port_;
     if (want == 0) {
       ASSERT_THAT(
-          bind(socket_fd, reinterpret_cast<const struct sockaddr *>(&addr),
+          bind(socket_fd, reinterpret_cast<const struct sockaddr*>(&addr),
                sizeof(addr)),
           SyscallSucceeds());
     } else {
       ASSERT_THAT(
-          bind(socket_fd, reinterpret_cast<const struct sockaddr *>(&addr),
+          bind(socket_fd, reinterpret_cast<const struct sockaddr*>(&addr),
                sizeof(addr)),
           SyscallFailsWithErrno(want));
     }
@@ -169,7 +169,7 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
       // remember it for future commands.
       socklen_t addr_size = sizeof(addr);
       ASSERT_THAT(
-          getsockname(socket_fd, reinterpret_cast<struct sockaddr *>(&addr),
+          getsockname(socket_fd, reinterpret_cast<struct sockaddr*>(&addr),
                       &addr_size),
           SyscallSucceeds());
       port_ = addr.sin_port;
diff --git a/test/syscalls/linux/socket_netdevice.cc b/test/syscalls/linux/socket_netdevice.cc
index 405dbbd73..15d4b85a7 100644
--- a/test/syscalls/linux/socket_netdevice.cc
+++ b/test/syscalls/linux/socket_netdevice.cc
@@ -91,7 +91,7 @@ TEST(NetdeviceTest, Netmask) {
   int prefixlen = -1;
   ASSERT_NO_ERRNO(NetlinkRequestResponse(
       fd, &req, sizeof(req),
-      [&](const struct nlmsghdr *hdr) {
+      [&](const struct nlmsghdr* hdr) {
         EXPECT_THAT(hdr->nlmsg_type, AnyOf(Eq(RTM_NEWADDR), Eq(NLMSG_DONE)));
 
         EXPECT_TRUE((hdr->nlmsg_flags & NLM_F_MULTI) == NLM_F_MULTI)
@@ -107,8 +107,8 @@ TEST(NetdeviceTest, Netmask) {
         // RTM_NEWADDR contains at least the header and ifaddrmsg.
         EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct ifaddrmsg));
 
-        struct ifaddrmsg *ifaddrmsg =
-            reinterpret_cast<struct ifaddrmsg *>(NLMSG_DATA(hdr));
+        struct ifaddrmsg* ifaddrmsg =
+            reinterpret_cast<struct ifaddrmsg*>(NLMSG_DATA(hdr));
         if (ifaddrmsg->ifa_index == static_cast<uint32_t>(ifr.ifr_ifindex) &&
             ifaddrmsg->ifa_family == AF_INET) {
           prefixlen = ifaddrmsg->ifa_prefixlen;
@@ -127,8 +127,8 @@ TEST(NetdeviceTest, Netmask) {
   snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
   ASSERT_THAT(ioctl(sock.get(), SIOCGIFNETMASK, &ifr), SyscallSucceeds());
   EXPECT_EQ(ifr.ifr_netmask.sa_family, AF_INET);
-  struct sockaddr_in *sin =
-      reinterpret_cast<struct sockaddr_in *>(&ifr.ifr_netmask);
+  struct sockaddr_in* sin =
+      reinterpret_cast<struct sockaddr_in*>(&ifr.ifr_netmask);
   EXPECT_EQ(sin->sin_addr.s_addr, mask);
 }
 
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index c1e45e10a..388d75835 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -377,7 +377,7 @@ TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoRandomSave) {
   //
   // We need to support this because when a file is unlinked and we forward
   // the stat to the gofer it would return ENOENT.
-  const char *uncached_gofer = getenv("GVISOR_GOFER_UNCACHED");
+  const char* uncached_gofer = getenv("GVISOR_GOFER_UNCACHED");
   SKIP_IF(uncached_gofer != nullptr);
 
   // We don't support saving unlinked files.
@@ -599,8 +599,8 @@ struct kernel_statx {
   uint64_t __spare2[14];
 };
 
-int statx(int dirfd, const char *pathname, int flags, unsigned int mask,
-          struct kernel_statx *statxbuf) {
+int statx(int dirfd, const char* pathname, int flags, unsigned int mask,
+          struct kernel_statx* statxbuf) {
   return syscall(SYS_statx, dirfd, pathname, flags, mask, statxbuf);
 }
 
diff --git a/test/util/mount_util.h b/test/util/mount_util.h
index 23eea51a2..09e2281eb 100644
--- a/test/util/mount_util.h
+++ b/test/util/mount_util.h
@@ -31,10 +31,10 @@ namespace testing {
 
 // Mount mounts the filesystem, and unmounts when the returned reference is
 // destroyed.
-inline PosixErrorOr<Cleanup> Mount(const std::string &source,
-                                   const std::string &target,
-                                   const std::string &fstype,
-                                   uint64_t mountflags, const std::string &data,
+inline PosixErrorOr<Cleanup> Mount(const std::string& source,
+                                   const std::string& target,
+                                   const std::string& fstype,
+                                   uint64_t mountflags, const std::string& data,
                                    uint64_t umountflags) {
   if (mount(source.c_str(), target.c_str(), fstype.c_str(), mountflags,
             data.c_str()) == -1) {
diff --git a/vdso/syscalls.h b/vdso/syscalls.h
index f5865bb72..b6d15a7d3 100644
--- a/vdso/syscalls.h
+++ b/vdso/syscalls.h
@@ -65,8 +65,8 @@ static inline int sys_rt_sigreturn(void) {
   return num;
 }
 
-static inline int sys_clock_gettime(clockid_t _clkid, struct timespec *_ts) {
-  register struct timespec *ts asm("x1") = _ts;
+static inline int sys_clock_gettime(clockid_t _clkid, struct timespec* _ts) {
+  register struct timespec* ts asm("x1") = _ts;
   register clockid_t clkid asm("x0") = _clkid;
   register long ret asm("x0");
   register long nr asm("x8") = __NR_clock_gettime;
@@ -78,8 +78,8 @@ static inline int sys_clock_gettime(clockid_t _clkid, struct timespec *_ts) {
   return ret;
 }
 
-static inline int sys_clock_getres(clockid_t _clkid, struct timespec *_ts) {
-  register struct timespec *ts asm("x1") = _ts;
+static inline int sys_clock_getres(clockid_t _clkid, struct timespec* _ts) {
+  register struct timespec* ts asm("x1") = _ts;
   register clockid_t clkid asm("x0") = _clkid;
   register long ret asm("x0");
   register long nr asm("x8") = __NR_clock_getres;
-- 
cgit v1.2.3


From 76483b8b1ec4ee1fb6b6efb6bdcfaf6dba7be4ce Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 28 Jan 2020 11:12:01 -0800
Subject: Check sigsetsize in rt_sigaction

This isn't in the libc wrapper, but it is in the syscall itself.

Discovered by @xiaobo55x in #1625.

PiperOrigin-RevId: 291973931
---
 pkg/sentry/strace/linux64_amd64.go      |  2 +-
 pkg/sentry/strace/linux64_arm64.go      |  2 +-
 pkg/sentry/syscalls/linux/sys_signal.go |  5 ++++
 test/syscalls/linux/sigaction.cc        | 53 +++++++++++++++++++--------------
 4 files changed, 38 insertions(+), 24 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go
index 1e823b685..85ec66fd3 100644
--- a/pkg/sentry/strace/linux64_amd64.go
+++ b/pkg/sentry/strace/linux64_amd64.go
@@ -37,7 +37,7 @@ var linuxAMD64 = SyscallMap{
 	10:  makeSyscallInfo("mprotect", Hex, Hex, Hex),
 	11:  makeSyscallInfo("munmap", Hex, Hex),
 	12:  makeSyscallInfo("brk", Hex),
-	13:  makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction),
+	13:  makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction, Hex),
 	14:  makeSyscallInfo("rt_sigprocmask", SignalMaskAction, SigSet, PostSigSet, Hex),
 	15:  makeSyscallInfo("rt_sigreturn"),
 	16:  makeSyscallInfo("ioctl", FD, Hex, Hex),
diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go
index c3ac5248d..8bc38545f 100644
--- a/pkg/sentry/strace/linux64_arm64.go
+++ b/pkg/sentry/strace/linux64_arm64.go
@@ -158,7 +158,7 @@ var linuxARM64 = SyscallMap{
 	131: makeSyscallInfo("tgkill", Hex, Hex, Signal),
 	132: makeSyscallInfo("sigaltstack", Hex, Hex),
 	133: makeSyscallInfo("rt_sigsuspend", Hex),
-	134: makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction),
+	134: makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction, Hex),
 	135: makeSyscallInfo("rt_sigprocmask", SignalMaskAction, SigSet, PostSigSet, Hex),
 	136: makeSyscallInfo("rt_sigpending", Hex),
 	137: makeSyscallInfo("rt_sigtimedwait", SigSet, Hex, Timespec, Hex),
diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go
index 209be2990..7e1747a0c 100644
--- a/pkg/sentry/syscalls/linux/sys_signal.go
+++ b/pkg/sentry/syscalls/linux/sys_signal.go
@@ -245,6 +245,11 @@ func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 	sig := linux.Signal(args[0].Int())
 	newactarg := args[1].Pointer()
 	oldactarg := args[2].Pointer()
+	sigsetsize := args[3].SizeT()
+
+	if sigsetsize != linux.SignalSetSize {
+		return 0, nil, syserror.EINVAL
+	}
 
 	var newactptr *arch.SignalAct
 	if newactarg != 0 {
diff --git a/test/syscalls/linux/sigaction.cc b/test/syscalls/linux/sigaction.cc
index 9a53fd3e0..9d9dd57a8 100644
--- a/test/syscalls/linux/sigaction.cc
+++ b/test/syscalls/linux/sigaction.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <signal.h>
+#include <sys/syscall.h>
 
 #include "gtest/gtest.h"
 #include "test/util/test_util.h"
@@ -23,45 +24,53 @@ namespace testing {
 namespace {
 
 TEST(SigactionTest, GetLessThanOrEqualToZeroFails) {
-  struct sigaction act;
-  memset(&act, 0, sizeof(act));
-  ASSERT_THAT(sigaction(-1, NULL, &act), SyscallFailsWithErrno(EINVAL));
-  ASSERT_THAT(sigaction(0, NULL, &act), SyscallFailsWithErrno(EINVAL));
+  struct sigaction act = {};
+  ASSERT_THAT(sigaction(-1, nullptr, &act), SyscallFailsWithErrno(EINVAL));
+  ASSERT_THAT(sigaction(0, nullptr, &act), SyscallFailsWithErrno(EINVAL));
 }
 
 TEST(SigactionTest, SetLessThanOrEqualToZeroFails) {
-  struct sigaction act;
-  memset(&act, 0, sizeof(act));
-  ASSERT_THAT(sigaction(0, &act, NULL), SyscallFailsWithErrno(EINVAL));
-  ASSERT_THAT(sigaction(0, &act, NULL), SyscallFailsWithErrno(EINVAL));
+  struct sigaction act = {};
+  ASSERT_THAT(sigaction(0, &act, nullptr), SyscallFailsWithErrno(EINVAL));
+  ASSERT_THAT(sigaction(0, &act, nullptr), SyscallFailsWithErrno(EINVAL));
 }
 
 TEST(SigactionTest, GetGreaterThanMaxFails) {
-  struct sigaction act;
-  memset(&act, 0, sizeof(act));
-  ASSERT_THAT(sigaction(SIGRTMAX + 1, NULL, &act),
+  struct sigaction act = {};
+  ASSERT_THAT(sigaction(SIGRTMAX + 1, nullptr, &act),
               SyscallFailsWithErrno(EINVAL));
 }
 
 TEST(SigactionTest, SetGreaterThanMaxFails) {
-  struct sigaction act;
-  memset(&act, 0, sizeof(act));
-  ASSERT_THAT(sigaction(SIGRTMAX + 1, &act, NULL),
+  struct sigaction act = {};
+  ASSERT_THAT(sigaction(SIGRTMAX + 1, &act, nullptr),
               SyscallFailsWithErrno(EINVAL));
 }
 
 TEST(SigactionTest, SetSigkillFails) {
-  struct sigaction act;
-  memset(&act, 0, sizeof(act));
-  ASSERT_THAT(sigaction(SIGKILL, NULL, &act), SyscallSucceeds());
-  ASSERT_THAT(sigaction(SIGKILL, &act, NULL), SyscallFailsWithErrno(EINVAL));
+  struct sigaction act = {};
+  ASSERT_THAT(sigaction(SIGKILL, nullptr, &act), SyscallSucceeds());
+  ASSERT_THAT(sigaction(SIGKILL, &act, nullptr), SyscallFailsWithErrno(EINVAL));
 }
 
 TEST(SigactionTest, SetSigstopFails) {
-  struct sigaction act;
-  memset(&act, 0, sizeof(act));
-  ASSERT_THAT(sigaction(SIGSTOP, NULL, &act), SyscallSucceeds());
-  ASSERT_THAT(sigaction(SIGSTOP, &act, NULL), SyscallFailsWithErrno(EINVAL));
+  struct sigaction act = {};
+  ASSERT_THAT(sigaction(SIGSTOP, nullptr, &act), SyscallSucceeds());
+  ASSERT_THAT(sigaction(SIGSTOP, &act, nullptr), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SigactionTest, BadSigsetFails) {
+  constexpr size_t kWrongSigSetSize = 43;
+
+  struct sigaction act = {};
+
+  // The syscall itself (rather than the libc wrapper) takes the sigset_t size.
+  ASSERT_THAT(
+      syscall(SYS_rt_sigaction, SIGTERM, nullptr, &act, kWrongSigSetSize),
+      SyscallFailsWithErrno(EINVAL));
+  ASSERT_THAT(
+      syscall(SYS_rt_sigaction, SIGTERM, &act, nullptr, kWrongSigSetSize),
+      SyscallFailsWithErrno(EINVAL));
 }
 
 }  // namespace
-- 
cgit v1.2.3


From d99329e58492ef91b44a0bac346f757e8af2a7ec Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <henry.tjf@antfin.com>
Date: Tue, 28 Jan 2020 12:31:58 -0800
Subject: netlink: add support for RTM_F_LOOKUP_TABLE

Test command:
  $ ip route get 1.1.1.1

Fixes: #1099

Signed-off-by: Jianfeng Tan <henry.tjf@antfin.com>
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/1121 from tanjianfeng:fix-1099 e6919f3d4ede5aa51a48b3d2be0d7a4b482dd53d
PiperOrigin-RevId: 291990716
---
 pkg/abi/linux/netlink_route.go              |  13 +++
 pkg/sentry/socket/netlink/route/BUILD       |   6 +-
 pkg/sentry/socket/netlink/route/protocol.go | 158 +++++++++++++++++++++++++---
 test/syscalls/linux/socket_netlink_route.cc |  84 +++++++++++++++
 test/syscalls/linux/socket_netlink_util.cc  |  38 +++++++
 test/syscalls/linux/socket_netlink_util.h   |  11 +-
 6 files changed, 295 insertions(+), 15 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go
index 0e3582ab6..40bec566c 100644
--- a/pkg/abi/linux/netlink_route.go
+++ b/pkg/abi/linux/netlink_route.go
@@ -205,6 +205,9 @@ type RouteMessage struct {
 	Flags uint32
 }
 
+// SizeOfRouteMessage is the size of RouteMessage.
+const SizeOfRouteMessage = 12
+
 // Route types, from uapi/linux/rtnetlink.h.
 const (
 	// RTN_UNSPEC represents an unspecified route type.
@@ -331,3 +334,13 @@ const (
 	RTF_GATEWAY = 0x2
 	RTF_UP      = 0x1
 )
+
+// RtAttr is the header of optional additional route information, as a netlink
+// attribute. From include/uapi/linux/rtnetlink.h.
+type RtAttr struct {
+	Len  uint16
+	Type uint16
+}
+
+// SizeOfRtAttr is the size of RtAttr.
+const SizeOfRtAttr = 4
diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD
index 0234aadde..622a1eafc 100644
--- a/pkg/sentry/socket/netlink/route/BUILD
+++ b/pkg/sentry/socket/netlink/route/BUILD
@@ -4,15 +4,19 @@ package(licenses = ["notice"])
 
 go_library(
     name = "route",
-    srcs = ["protocol.go"],
+    srcs = [
+        "protocol.go",
+    ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/binary",
         "//pkg/context",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket/netlink",
         "//pkg/syserr",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index 80a15d6cb..2b3c7f5b3 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -19,12 +19,14 @@ import (
 	"bytes"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // commandKind describes the operational class of a message type.
@@ -66,8 +68,14 @@ func (p *Protocol) CanSend() bool {
 	return true
 }
 
-// dumpLinks handles RTM_GETLINK + NLM_F_DUMP requests.
+// dumpLinks handles RTM_GETLINK dump requests.
 func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+	// TODO(b/68878065): Only the dump variant of the types below are
+	// supported.
+	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
+		return syserr.ErrNotSupported
+	}
+
 	// NLM_F_DUMP + RTM_GETLINK messages are supposed to include an
 	// ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some
 	// userspace applications (including glibc) still include rtgenmsg.
@@ -121,8 +129,14 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader
 	return nil
 }
 
-// dumpAddrs handles RTM_GETADDR + NLM_F_DUMP requests.
+// dumpAddrs handles RTM_GETADDR dump requests.
 func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+	// TODO(b/68878065): Only the dump variant of the types below are
+	// supported.
+	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
+		return syserr.ErrNotSupported
+	}
+
 	// RTM_GETADDR dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
@@ -163,22 +177,146 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader
 	return nil
 }
 
-// dumpRoutes handles RTM_GETROUTE + NLM_F_DUMP requests.
+// commonPrefixLen reports the length, in bits, of the longest common prefix of
+// a and b. This is a simplified version from Golang's src/net/addrselect.go.
+func commonPrefixLen(a, b []byte) (cpl int) {
+	for len(a) > 0 {
+		if a[0] == b[0] {
+			cpl += 8
+			a = a[1:]
+			b = b[1:]
+			continue
+		}
+		bits := 8
+		ab, bb := a[0], b[0]
+		for {
+			ab >>= 1
+			bb >>= 1
+			bits--
+			if ab == bb {
+				cpl += bits
+				return
+			}
+		}
+	}
+	return
+}
+
+// fillRoute returns the Route using LPM algorithm. Refer to Linux's
+// net/ipv4/route.c:rt_fill_info().
+func fillRoute(routes []inet.Route, addr []byte) (inet.Route, *syserr.Error) {
+	family := uint8(linux.AF_INET)
+	if len(addr) != 4 {
+		family = linux.AF_INET6
+	}
+
+	idx := -1    // Index of the Route rule to be returned.
+	idxDef := -1 // Index of the default route rule.
+	prefix := 0  // Current longest prefix.
+	for i, route := range routes {
+		if route.Family != family {
+			continue
+		}
+
+		if len(route.GatewayAddr) > 0 && route.DstLen == 0 {
+			idxDef = i
+			continue
+		}
+
+		cpl := commonPrefixLen(addr, route.DstAddr)
+		if cpl < int(route.DstLen) {
+			continue
+		}
+		cpl = int(route.DstLen)
+		if cpl > prefix {
+			idx = i
+			prefix = cpl
+		}
+	}
+	if idx == -1 {
+		idx = idxDef
+	}
+	if idx == -1 {
+		return inet.Route{}, syserr.ErrNoRoute
+	}
+
+	route := routes[idx]
+	if family == linux.AF_INET {
+		route.DstLen = 32
+	} else {
+		route.DstLen = 128
+	}
+	route.DstAddr = addr
+	route.Flags |= linux.RTM_F_CLONED // This route is cloned.
+	return route, nil
+}
+
+// parseForDestination parses a message laid out as RouteMessage-RtAttr-dst.
+func parseForDestination(data []byte) ([]byte, *syserr.Error) {
+	var rtMsg linux.RouteMessage
+	if len(data) < linux.SizeOfRouteMessage {
+		return nil, syserr.ErrInvalidArgument
+	}
+	binary.Unmarshal(data[:linux.SizeOfRouteMessage], usermem.ByteOrder, &rtMsg)
+	// iproute2 added the RTM_F_LOOKUP_TABLE flag in version v4.4.0. See
+	// commit bc234301af12. Note we don't check this flag for backward
+	// compatibility.
+	if rtMsg.Flags != 0 && rtMsg.Flags != linux.RTM_F_LOOKUP_TABLE {
+		return nil, syserr.ErrNotSupported
+	}
+
+	data = data[linux.SizeOfRouteMessage:]
+
+	// TODO(gvisor.dev/issue/1611): Add generic attribute parsing.
+	var rtAttr linux.RtAttr
+	if len(data) < linux.SizeOfRtAttr {
+		return nil, syserr.ErrInvalidArgument
+	}
+	binary.Unmarshal(data[:linux.SizeOfRtAttr], usermem.ByteOrder, &rtAttr)
+	if rtAttr.Type != linux.RTA_DST {
+		return nil, syserr.ErrInvalidArgument
+	}
+
+	if len(data) < int(rtAttr.Len) {
+		return nil, syserr.ErrInvalidArgument
+	}
+	return data[linux.SizeOfRtAttr:rtAttr.Len], nil
+}
+
+// dumpRoutes handles RTM_GETROUTE requests.
 func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
 	// RTM_GETROUTE dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
 
-	// We always send back an NLMSG_DONE.
-	ms.Multi = true
-
 	stack := inet.StackFromContext(ctx)
 	if stack == nil {
 		// No network routes.
 		return nil
 	}
 
-	for _, rt := range stack.RouteTable() {
+	routeTables := stack.RouteTable()
+
+	if hdr.Flags == linux.NLM_F_REQUEST {
+		dst, err := parseForDestination(data)
+		if err != nil {
+			return err
+		}
+		route, err := fillRoute(routeTables, dst)
+		if err != nil {
+			// TODO(gvisor.dev/issue/1237): return NLMSG_ERROR with ENETUNREACH.
+			return syserr.ErrNotSupported
+		}
+		routeTables = append([]inet.Route{}, route)
+	} else if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP {
+		// We always send back an NLMSG_DONE.
+		ms.Multi = true
+	} else {
+		// TODO(b/68878065): Only above cases are supported.
+		return syserr.ErrNotSupported
+	}
+
+	for _, rt := range routeTables {
 		m := ms.AddMessage(linux.NetlinkMessageHeader{
 			Type: linux.RTM_NEWROUTE,
 		})
@@ -236,12 +374,6 @@ func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageH
 		}
 	}
 
-	// TODO(b/68878065): Only the dump variant of the types below are
-	// supported.
-	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
-		return syserr.ErrNotSupported
-	}
-
 	switch hdr.Type {
 	case linux.RTM_GETLINK:
 		return p.dumpLinks(ctx, hdr, data, ms)
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index ef567f512..1e28e658d 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -442,6 +442,90 @@ TEST(NetlinkRouteTest, GetRouteDump) {
   EXPECT_TRUE(dstFound);
 }
 
+// GetRouteRequest tests a RTM_GETROUTE request with RTM_F_LOOKUP_TABLE flag.
+TEST(NetlinkRouteTest, GetRouteRequest) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+  uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+
+  struct __attribute__((__packed__)) request {
+    struct nlmsghdr hdr;
+    struct rtmsg rtm;
+    struct nlattr nla;
+    struct in_addr sin_addr;
+  };
+
+  constexpr uint32_t kSeq = 12345;
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETROUTE;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+
+  req.rtm.rtm_family = AF_INET;
+  req.rtm.rtm_dst_len = 32;
+  req.rtm.rtm_src_len = 0;
+  req.rtm.rtm_tos = 0;
+  req.rtm.rtm_table = RT_TABLE_UNSPEC;
+  req.rtm.rtm_protocol = RTPROT_UNSPEC;
+  req.rtm.rtm_scope = RT_SCOPE_UNIVERSE;
+  req.rtm.rtm_type = RTN_UNSPEC;
+  req.rtm.rtm_flags = RTM_F_LOOKUP_TABLE;
+
+  req.nla.nla_len = 8;
+  req.nla.nla_type = RTA_DST;
+  inet_aton("127.0.0.2", &req.sin_addr);
+
+  bool rtDstFound = false;
+  ASSERT_NO_ERRNO(NetlinkRequestResponseSingle(
+      fd, &req, sizeof(req), [&](const struct nlmsghdr* hdr) {
+        // Validate the response to RTM_GETROUTE request with RTM_F_LOOKUP_TABLE
+        // flag.
+        EXPECT_THAT(hdr->nlmsg_type, RTM_NEWROUTE);
+
+        EXPECT_TRUE(hdr->nlmsg_flags == 0) << std::hex << hdr->nlmsg_flags;
+
+        EXPECT_EQ(hdr->nlmsg_seq, kSeq);
+        EXPECT_EQ(hdr->nlmsg_pid, port);
+
+        // RTM_NEWROUTE contains at least the header and rtmsg.
+        ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct rtmsg)));
+        const struct rtmsg* msg =
+            reinterpret_cast<const struct rtmsg*>(NLMSG_DATA(hdr));
+
+        // NOTE: rtmsg fields are char fields.
+        std::cout << "Found route table=" << static_cast<int>(msg->rtm_table)
+                  << ", protocol=" << static_cast<int>(msg->rtm_protocol)
+                  << ", scope=" << static_cast<int>(msg->rtm_scope)
+                  << ", type=" << static_cast<int>(msg->rtm_type);
+
+        EXPECT_EQ(msg->rtm_family, AF_INET);
+        EXPECT_EQ(msg->rtm_dst_len, 32);
+        EXPECT_TRUE((msg->rtm_flags & RTM_F_CLONED) == RTM_F_CLONED)
+            << std::hex << msg->rtm_flags;
+
+        int len = RTM_PAYLOAD(hdr);
+        std::cout << ", len=" << len;
+        for (struct rtattr* attr = RTM_RTA(msg); RTA_OK(attr, len);
+             attr = RTA_NEXT(attr, len)) {
+          if (attr->rta_type == RTA_DST) {
+            char address[INET_ADDRSTRLEN] = {};
+            inet_ntop(AF_INET, RTA_DATA(attr), address, sizeof(address));
+            std::cout << ", dst=" << address;
+            rtDstFound = true;
+          } else if (attr->rta_type == RTA_OIF) {
+            const char* oif = reinterpret_cast<const char*>(RTA_DATA(attr));
+            std::cout << ", oif=" << oif;
+          }
+        }
+
+        std::cout << std::endl;
+      }));
+  // Found RTA_DST for RTM_F_LOOKUP_TABLE.
+  EXPECT_TRUE(rtDstFound);
+}
+
 // RecvmsgTrunc tests the recvmsg MSG_TRUNC flag with zero length output
 // buffer. MSG_TRUNC with a zero length buffer should consume subsequent
 // messages off the socket.
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index 723f5d728..cd2212a1a 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -108,5 +108,43 @@ PosixError NetlinkRequestResponse(
   return NoError();
 }
 
+PosixError NetlinkRequestResponseSingle(
+    const FileDescriptor& fd, void* request, size_t len,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+  struct iovec iov = {};
+  iov.iov_base = request;
+  iov.iov_len = len;
+
+  struct msghdr msg = {};
+  msg.msg_iov = &iov;
+  msg.msg_iovlen = 1;
+  // No destination required; it defaults to pid 0, the kernel.
+
+  RETURN_ERROR_IF_SYSCALL_FAIL(RetryEINTR(sendmsg)(fd.get(), &msg, 0));
+
+  constexpr size_t kBufferSize = 4096;
+  std::vector<char> buf(kBufferSize);
+  iov.iov_base = buf.data();
+  iov.iov_len = buf.size();
+
+  int ret;
+  RETURN_ERROR_IF_SYSCALL_FAIL(ret = RetryEINTR(recvmsg)(fd.get(), &msg, 0));
+
+  // We don't bother with the complexity of dealing with truncated messages.
+  // We must allocate a large enough buffer up front.
+  if ((msg.msg_flags & MSG_TRUNC) == MSG_TRUNC) {
+    return PosixError(
+        EIO,
+        absl::StrCat("Received truncated message with flags: ", msg.msg_flags));
+  }
+
+  for (struct nlmsghdr* hdr = reinterpret_cast<struct nlmsghdr*>(buf.data());
+       NLMSG_OK(hdr, ret); hdr = NLMSG_NEXT(hdr, ret)) {
+    fn(hdr);
+  }
+
+  return NoError();
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index 76e772c48..3678c0599 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -32,12 +32,21 @@ PosixErrorOr<FileDescriptor> NetlinkBoundSocket(int protocol);
 // Returns the port ID of the passed socket.
 PosixErrorOr<uint32_t> NetlinkPortID(int fd);
 
-// Send the passed request and call fn will all response netlink messages.
+// Send the passed request and call fn on all response netlink messages.
+//
+// To be used on requests with NLM_F_MULTI responses.
 PosixError NetlinkRequestResponse(
     const FileDescriptor& fd, void* request, size_t len,
     const std::function<void(const struct nlmsghdr* hdr)>& fn,
     bool expect_nlmsgerr);
 
+// Send the passed request and call fn on all response netlink messages.
+//
+// To be used on requests without NLM_F_MULTI responses.
+PosixError NetlinkRequestResponseSingle(
+    const FileDescriptor& fd, void* request, size_t len,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn);
+
 }  // namespace testing
 }  // namespace gvisor
 
-- 
cgit v1.2.3


From f263801a74d4ccac042b068d0928c8738e40af5b Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 28 Jan 2020 13:36:16 -0800
Subject: fs/splice: don't report partial errors for special files

Special files can have additional requirements for granularity.
For example, a read from an eventfd returns EINVAL if the size is less than 8 bytes.

Reported-by: syzbot+3905f5493bec08eb7b02@syzkaller.appspotmail.com
PiperOrigin-RevId: 292002926
---
 pkg/sentry/fs/attr.go                   |  5 +++++
 pkg/sentry/fs/file.go                   |  7 -------
 pkg/sentry/fs/splice.go                 |  5 -----
 pkg/sentry/syscalls/linux/sys_splice.go | 19 +++++++++++++++----
 test/syscalls/linux/eventfd.cc          | 25 +++++++++++++++++++++++++
 5 files changed, 45 insertions(+), 16 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go
index fa9e7d517..f60bd423d 100644
--- a/pkg/sentry/fs/attr.go
+++ b/pkg/sentry/fs/attr.go
@@ -206,6 +206,11 @@ func IsPipe(s StableAttr) bool {
 	return s.Type == Pipe
 }
 
+// IsAnonymous returns true if StableAttr.Type matches any type of anonymous.
+func IsAnonymous(s StableAttr) bool {
+	return s.Type == Anonymous
+}
+
 // IsSocket returns true if StableAttr.Type matches any type of socket.
 func IsSocket(s StableAttr) bool {
 	return s.Type == Socket
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index ca3466f4f..78100e448 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -555,10 +555,6 @@ type lockedWriter struct {
 	//
 	// This applies only to Write, not WriteAt.
 	Offset int64
-
-	// Err contains the first error encountered while copying. This is
-	// useful to determine whether Writer or Reader failed during io.Copy.
-	Err error
 }
 
 // Write implements io.Writer.Write.
@@ -594,8 +590,5 @@ func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) {
 			break
 		}
 	}
-	if w.Err == nil {
-		w.Err = err
-	}
 	return written, err
 }
diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go
index 791d1526c..33da82868 100644
--- a/pkg/sentry/fs/splice.go
+++ b/pkg/sentry/fs/splice.go
@@ -167,11 +167,6 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64,
 		if !srcPipe && !opts.SrcOffset {
 			atomic.StoreInt64(&src.offset, src.offset+n)
 		}
-
-		// Don't report any errors if we have some progress without data loss.
-		if w.Err == nil {
-			err = nil
-		}
 	}
 
 	// Drop locks.
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index dd3a5807f..f43d6c155 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -211,8 +211,10 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	opts := fs.SpliceOpts{
 		Length: count,
 	}
+	inFileAttr := inFile.Dirent.Inode.StableAttr
+	outFileAttr := outFile.Dirent.Inode.StableAttr
 	switch {
-	case fs.IsPipe(inFile.Dirent.Inode.StableAttr) && !fs.IsPipe(outFile.Dirent.Inode.StableAttr):
+	case fs.IsPipe(inFileAttr) && !fs.IsPipe(outFileAttr):
 		if inOffset != 0 {
 			return 0, nil, syserror.ESPIPE
 		}
@@ -229,7 +231,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			opts.DstOffset = true
 			opts.DstStart = offset
 		}
-	case !fs.IsPipe(inFile.Dirent.Inode.StableAttr) && fs.IsPipe(outFile.Dirent.Inode.StableAttr):
+	case !fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr):
 		if outOffset != 0 {
 			return 0, nil, syserror.ESPIPE
 		}
@@ -246,13 +248,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			opts.SrcOffset = true
 			opts.SrcStart = offset
 		}
-	case fs.IsPipe(inFile.Dirent.Inode.StableAttr) && fs.IsPipe(outFile.Dirent.Inode.StableAttr):
+	case fs.IsPipe(inFileAttr) && fs.IsPipe(outFileAttr):
 		if inOffset != 0 || outOffset != 0 {
 			return 0, nil, syserror.ESPIPE
 		}
 
 		// We may not refer to the same pipe; otherwise it's a continuous loop.
-		if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID {
+		if inFileAttr.InodeID == outFileAttr.InodeID {
 			return 0, nil, syserror.EINVAL
 		}
 	default:
@@ -262,6 +264,15 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 	// Splice data.
 	n, err := doSplice(t, outFile, inFile, opts, nonBlock)
 
+	// Special files can have additional requirements for granularity.  For
+	// example, read from eventfd returns EINVAL if a size is less than 8
+	// bytes. Inotify is another example: read will return EINVAL if a buffer
+	// is too small to return the next event, but the size of an event isn't
+	// fixed, it is sizeof(struct inotify_event) + {NAME_LEN} + 1.
+	if n != 0 && err != nil && (fs.IsAnonymous(inFileAttr) || fs.IsAnonymous(outFileAttr)) {
+		err = nil
+	}
+
 	// See above; inFile is chosen arbitrarily here.
 	return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "splice", inFile)
 }
diff --git a/test/syscalls/linux/eventfd.cc b/test/syscalls/linux/eventfd.cc
index 367682c3d..927001eee 100644
--- a/test/syscalls/linux/eventfd.cc
+++ b/test/syscalls/linux/eventfd.cc
@@ -132,6 +132,31 @@ TEST(EventfdTest, BigWriteBigRead) {
   EXPECT_EQ(l[0], 1);
 }
 
+TEST(EventfdTest, SpliceFromPipePartialSucceeds) {
+  constexpr uint64_t kVal{1};
+  int fds[2];
+  ASSERT_THAT(pipe2(fds, O_NONBLOCK), SyscallSucceeds());
+  const FileDescriptor pipe_rfd(fds[0]);
+  const FileDescriptor pipe_wfd(fds[1]);
+
+  FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK));
+
+  // Queue two counter-sized values in the pipe, then ask splice for one byte
+  // more than a single value; exactly one value is expected to be moved.
+  uint64_t events[2] = {kVal, kVal};
+  ASSERT_THAT(write(pipe_wfd.get(), events, sizeof(events)),
+              SyscallSucceedsWithValue(sizeof(events)));
+  EXPECT_THAT(splice(pipe_rfd.get(), /*__offin=*/nullptr, efd.get(),
+                     /*__offout=*/nullptr, sizeof(events[0]) + 1,
+                     SPLICE_F_NONBLOCK),
+              SyscallSucceedsWithValue(sizeof(events[0])));
+
+  uint64_t counter;
+  ASSERT_THAT(read(efd.get(), &counter, sizeof(counter)),
+              SyscallSucceedsWithValue(sizeof(counter)));
+  EXPECT_EQ(counter, kVal);
+}
+
 // NotifyNonZero is inherently racy, so random save is disabled.
 TEST(EventfdTest, NotifyNonZero_NoRandomSave) {
   // Waits will time out at 10 seconds.
-- 
cgit v1.2.3


From 51b783505b1ec164b02b48a0fd234509fba01a73 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Wed, 29 Jan 2020 15:41:51 -0800
Subject: Add support for TCP_DEFER_ACCEPT.

PiperOrigin-RevId: 292233574
---
 pkg/sentry/socket/netstack/netstack.go      |  22 ++++
 pkg/tcpip/tcpip.go                          |   6 ++
 pkg/tcpip/transport/tcp/BUILD               |   1 +
 pkg/tcpip/transport/tcp/accept.go           |  25 ++---
 pkg/tcpip/transport/tcp/connect.go          |  53 +++++++++-
 pkg/tcpip/transport/tcp/endpoint.go         |  26 ++++-
 pkg/tcpip/transport/tcp/forwarder.go        |   4 +-
 pkg/tcpip/transport/tcp/tcp_test.go         | 126 ++++++++++++++++++++++
 test/syscalls/linux/socket_inet_loopback.cc | 158 ++++++++++++++++++++++++++++
 test/syscalls/linux/tcp_socket.cc           |  53 ++++++++++
 10 files changed, 451 insertions(+), 23 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 8619cc506..049d04bf2 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1260,6 +1260,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 
 		return int32(time.Duration(v) / time.Second), nil
 
+	case linux.TCP_DEFER_ACCEPT:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		var v tcpip.TCPDeferAcceptOption
+		if err := ep.GetSockOpt(&v); err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		return int32(time.Duration(v) / time.Second), nil
+
 	default:
 		emitUnimplementedEventTCP(t, name)
 	}
@@ -1713,6 +1725,16 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		v := usermem.ByteOrder.Uint32(optVal)
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))))
 
+	case linux.TCP_DEFER_ACCEPT:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+		v := int32(usermem.ByteOrder.Uint32(optVal))
+		if v < 0 {
+			v = 0
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))))
+
 	case linux.TCP_REPAIR_OPTIONS:
 		t.Kernel().EmitUnimplementedEvent(t)
 
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 59c9b3fb0..0fa141d58 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -626,6 +626,12 @@ type TCPLingerTimeoutOption time.Duration
 // before being marked closed.
 type TCPTimeWaitTimeoutOption time.Duration
 
+// TCPDeferAcceptOption is used by SetSockOpt/GetSockOpt to allow an
+// accept to return a completed connection only when there is data to be
+// read. This usually means the listening socket will drop the final ACK of
+// a handshake until a segment with data arrives or the timeout expires.
+type TCPDeferAcceptOption time.Duration
+
 // MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
 // TTL value for multicast messages. The default is 1.
 type MulticastTTLOption uint8
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 4acd9fb9a..7b4a87a2d 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -57,6 +57,7 @@ go_library(
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/log",
         "//pkg/rand",
         "//pkg/sleep",
         "//pkg/sync",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index d469758eb..6101f2945 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -222,13 +222,13 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu
 
 // createConnectingEndpoint creates a new endpoint in a connecting state, with
 // the connection parameters given by the arguments.
-func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions) (*endpoint, *tcpip.Error) {
+func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create a new endpoint.
 	netProto := l.netProto
 	if netProto == 0 {
 		netProto = s.route.NetProto
 	}
-	n := newEndpoint(l.stack, netProto, nil)
+	n := newEndpoint(l.stack, netProto, queue)
 	n.v6only = l.v6only
 	n.ID = s.id
 	n.boundNICID = s.route.NICID()
@@ -273,16 +273,17 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 
 // createEndpoint creates a new endpoint in connected state and then performs
 // the TCP 3-way handshake.
-func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions) (*endpoint, *tcpip.Error) {
+func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) {
 	// Create new endpoint.
 	irs := s.sequenceNumber
 	isn := generateSecureISN(s.id, l.stack.Seed())
-	ep, err := l.createConnectingEndpoint(s, isn, irs, opts)
+	ep, err := l.createConnectingEndpoint(s, isn, irs, opts, queue)
 	if err != nil {
 		return nil, err
 	}
 
 	// listenEP is nil when listenContext is used by tcp.Forwarder.
+	deferAccept := time.Duration(0)
 	if l.listenEP != nil {
 		l.listenEP.mu.Lock()
 		if l.listenEP.EndpointState() != StateListen {
@@ -290,13 +291,12 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 			return nil, tcpip.ErrConnectionAborted
 		}
 		l.addPendingEndpoint(ep)
+		deferAccept = l.listenEP.deferAccept
 		l.listenEP.mu.Unlock()
 	}
 
 	// Perform the 3-way handshake.
-	h := newHandshake(ep, seqnum.Size(ep.initialReceiveWindow()))
-
-	h.resetToSynRcvd(isn, irs, opts)
+	h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept)
 	if err := h.execute(); err != nil {
 		ep.Close()
 		if l.listenEP != nil {
@@ -377,16 +377,14 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 	defer e.decSynRcvdCount()
 	defer s.decRef()
 
-	n, err := ctx.createEndpointAndPerformHandshake(s, opts)
+	n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{})
 	if err != nil {
 		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 		e.stats.FailedConnectionAttempts.Increment()
 		return
 	}
 	ctx.removePendingEndpoint(n)
-	// Start the protocol goroutine.
-	wq := &waiter.Queue{}
-	n.startAcceptedLoop(wq)
+	n.startAcceptedLoop()
 	e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 
 	e.deliverAccepted(n)
@@ -546,7 +544,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr
 		}
 
-		n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions)
+		n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions, &waiter.Queue{})
 		if err != nil {
 			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 			e.stats.FailedConnectionAttempts.Increment()
@@ -576,8 +574,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// space available in the backlog.
 
 		// Start the protocol goroutine.
-		wq := &waiter.Queue{}
-		n.startAcceptedLoop(wq)
+		n.startAcceptedLoop()
 		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 		go e.deliverAccepted(n)
 	}
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 4e3c5419c..9ff7ac261 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -86,6 +86,19 @@ type handshake struct {
 
 	// rcvWndScale is the receive window scale, as defined in RFC 1323.
 	rcvWndScale int
+
+	// startTime is the time at which the first SYN/SYN-ACK was sent.
+	startTime time.Time
+
+	// deferAccept if non-zero will drop the final ACK for a passive
+	// handshake till an ACK segment with data is received or the timeout is
+	// hit.
+	deferAccept time.Duration
+
+	// acked is true if the final ACK for a 3-way handshake has
+	// been received. This is required to stop retransmitting the
+	// original SYN-ACK when deferAccept is enabled.
+	acked bool
 }
 
 func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
@@ -112,6 +125,12 @@ func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
 	return h
 }
 
+func newPassiveHandshake(ep *endpoint, rcvWnd seqnum.Size, isn, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) handshake {
+	h := newHandshake(ep, rcvWnd)
+	h.resetToSynRcvd(isn, irs, opts, deferAccept) // Marks h as passive in SYN-RCVD and stores deferAccept.
+	return h
+}
+
 // FindWndScale determines the window scale to use for the given maximum window
 // size.
 func FindWndScale(wnd seqnum.Size) int {
@@ -181,7 +200,7 @@ func (h *handshake) effectiveRcvWndScale() uint8 {
 
 // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
 // state.
-func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions) {
+func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) {
 	h.active = false
 	h.state = handshakeSynRcvd
 	h.flags = header.TCPFlagSyn | header.TCPFlagAck
@@ -189,6 +208,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
 	h.ackNum = irs + 1
 	h.mss = opts.MSS
 	h.sndWndScale = opts.WS
+	h.deferAccept = deferAccept
 	h.ep.mu.Lock()
 	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
@@ -352,6 +372,14 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 	// We have previously received (and acknowledged) the peer's SYN. If the
 	// peer acknowledges our SYN, the handshake is completed.
 	if s.flagIsSet(header.TCPFlagAck) {
+		// If deferAccept is not zero and this is a bare ACK and the
+		// timeout is not hit then drop the ACK.
+		if h.deferAccept != 0 && s.data.Size() == 0 && time.Since(h.startTime) < h.deferAccept {
+			h.acked = true
+			h.ep.stack.Stats().DroppedPackets.Increment()
+			return nil
+		}
+
 		// If the timestamp option is negotiated and the segment does
 		// not carry a timestamp option then the segment must be dropped
 		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
@@ -365,10 +393,16 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
 		}
 		h.state = handshakeCompleted
+
 		h.ep.mu.Lock()
 		h.ep.transitionToStateEstablishedLocked(h)
+		// If the segment has data then requeue it for the receiver
+		// to process it again once main loop is started.
+		if s.data.Size() > 0 {
+			s.incRef()
+			h.ep.enqueueSegment(s)
+		}
 		h.ep.mu.Unlock()
-
 		return nil
 	}
 
@@ -471,6 +505,7 @@ func (h *handshake) execute() *tcpip.Error {
 		}
 	}
 
+	h.startTime = time.Now()
 	// Initialize the resend timer.
 	resendWaker := sleep.Waker{}
 	timeOut := time.Duration(time.Second)
@@ -524,11 +559,21 @@ func (h *handshake) execute() *tcpip.Error {
 		switch index, _ := s.Fetch(true); index {
 		case wakerForResend:
 			timeOut *= 2
-			if timeOut > 60*time.Second {
+			if timeOut > MaxRTO {
 				return tcpip.ErrTimeout
 			}
 			rt.Reset(timeOut)
-			h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+			// Resend the SYN/SYN-ACK only if the following conditions hold.
+			//  - It's an active handshake (deferAccept does not apply)
+			//  - It's a passive handshake and we have not yet got the final-ACK.
+			//  - It's a passive handshake and we got an ACK but deferAccept is
+			//    enabled and we are now past the deferAccept duration.
+			// The last is required to provide a way for the peer to complete
+			// the connection with another ACK or data (as ACKs are never
+			// retransmitted on their own).
+			if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept {
+				h.ep.sendSynTCP(&h.ep.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
+			}
 
 		case wakerForNotification:
 			n := h.ep.fetchNotifications()
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 13718ff55..8d52414b7 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -498,6 +498,13 @@ type endpoint struct {
 	// without any data being acked.
 	userTimeout time.Duration
 
+	// deferAccept if non-zero specifies a user specified time during
+	// which the final ACK of a handshake will be dropped provided the
+	// ACK is a bare ACK and carries no data. If the timeout is crossed then
+	// the bare ACK is accepted and the connection is delivered to the
+	// listener.
+	deferAccept time.Duration
+
 	// pendingAccepted is a synchronization primitive used to track number
 	// of connections that are queued up to be delivered to the accepted
 	// channel. We use this to ensure that all goroutines blocked on writing
@@ -1574,6 +1581,15 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case tcpip.TCPDeferAcceptOption:
+		e.mu.Lock()
+		if time.Duration(v) > MaxRTO {
+			v = tcpip.TCPDeferAcceptOption(MaxRTO)
+		}
+		e.deferAccept = time.Duration(v)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return nil
 	}
@@ -1798,6 +1814,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case *tcpip.TCPDeferAcceptOption:
+		e.mu.Lock()
+		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
+		e.mu.Unlock()
+		return nil
+
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -2149,9 +2171,8 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 
 // startAcceptedLoop sets up required state and starts a goroutine with the
 // main loop for accepted connections.
-func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
+func (e *endpoint) startAcceptedLoop() {
 	e.mu.Lock()
-	e.waiterQueue = waiterQueue
 	e.workerRunning = true
 	e.mu.Unlock()
 	wakerInitDone := make(chan struct{})
@@ -2177,7 +2198,6 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	default:
 		return nil, nil, tcpip.ErrWouldBlock
 	}
-
 	return n, n.waiterQueue, nil
 }
 
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index 7eb613be5..c9ee5bf06 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -157,13 +157,13 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint,
 		TSVal:         r.synOptions.TSVal,
 		TSEcr:         r.synOptions.TSEcr,
 		SACKPermitted: r.synOptions.SACKPermitted,
-	})
+	}, queue)
 	if err != nil {
 		return nil, err
 	}
 
 	// Start the protocol goroutine.
-	ep.startAcceptedLoop(queue)
+	ep.startAcceptedLoop()
 
 	return ep, nil
 }
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index df2fb1071..a12336d47 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -6787,3 +6787,129 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) {
 		),
 	)
 }
+
+func TestTCPDeferAccept(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	const tcpDeferAccept = 1 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s)) failed: %v", tcpDeferAccept, err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Send an ACK carrying data. This should result in an acceptable endpoint.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the 4 bytes of data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+
+	// Give a bit of time for the socket to be delivered to the accept queue.
+	time.Sleep(50 * time.Millisecond)
+	aep, _, err := c.EP.Accept()
+	if err != nil {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err)
+	}
+
+	aep.Close()
+	// Closing aep without reading the data should trigger a RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
+
+func TestTCPDeferAcceptTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	const tcpDeferAccept = 1 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s)) failed: %v", tcpDeferAccept, err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Sleep for a little over the tcpDeferAccept timeout.
+	time.Sleep(tcpDeferAccept + 100*time.Millisecond)
+
+	// On timeout expiry we should get a SYN-ACK retransmission.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
+		checker.AckNum(uint32(irs)+1)))
+
+	// Send an ACK carrying data. This should result in an acceptable endpoint.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the 4 bytes of data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+
+	// Give some time for the endpoint to be delivered to the accept queue.
+	time.Sleep(50 * time.Millisecond)
+	aep, _, err := c.EP.Accept()
+	if err != nil {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err)
+	}
+
+	aep.Close()
+	// Closing aep without reading the data should trigger a RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 2f9821555..3bf7081b9 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -828,6 +828,164 @@ TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
   EXPECT_EQ(get, kUserTimeout);
 }
 
+// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+// saved. Enable S/R once issue is fixed.
+TEST_P(SocketInetLoopbackTest, TCPDeferAccept_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+  // saved. Enable S/R once issue is fixed.
+  DisableSave ds;
+
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Set the TCP_DEFER_ACCEPT on the listening socket.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Set the listening socket to non-blocking so that we can verify that there
+  // is no connection in the accept queue despite the connect above succeeding:
+  // the peer has sent no data and TCP_DEFER_ACCEPT is set on the listening
+  // socket.
+  int opts;
+  ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds());
+  opts |= O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Set FD back to blocking.
+  opts &= ~O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Now write some data to the socket.
+  int data = 0;
+  ASSERT_THAT(RetryEINTR(write)(conn_fd.get(), &data, sizeof(data)),
+              SyscallSucceedsWithValue(sizeof(data)));
+
+  // This should now cause the connection to complete and be delivered to the
+  // accept socket.
+
+  // Accept the connection.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+  // Verify that the accepted socket returns the data written.
+  int get = -1;
+  ASSERT_THAT(RetryEINTR(recv)(accepted.get(), &get, sizeof(get), 0),
+              SyscallSucceedsWithValue(sizeof(get)));
+
+  EXPECT_EQ(get, data);
+}
+
+// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+// saved. Enable S/R once issue is fixed.
+TEST_P(SocketInetLoopbackTest, TCPDeferAcceptTimeout_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+  // saved. Enable S/R once issue is fixed.
+  DisableSave ds;
+
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Set the TCP_DEFER_ACCEPT on the listening socket.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+
+  // Connect to the listening socket.
+  FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&conn_addr),
+                                  connector.addr_len),
+              SyscallSucceeds());
+
+  // Set the listening socket to non-blocking so that we can verify that there
+  // is no connection in the accept queue despite the connect above succeeding:
+  // the peer has sent no data and TCP_DEFER_ACCEPT is set on the listening
+  // socket.
+  int opts;
+  ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds());
+  opts |= O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Verify that there is no acceptable connection before TCP_DEFER_ACCEPT
+  // timeout is hit.
+  absl::SleepFor(absl::Seconds(kTCPDeferAccept - 1));
+  ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Set FD back to blocking.
+  opts &= ~O_NONBLOCK;
+  ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+  // Sleep past the TCP_DEFER_ACCEPT duration (counting the sleep above). When
+  // the timeout is hit a SYN-ACK should be retransmitted by the listener as a
+  // last ditch attempt to complete the connection with or without data.
+  absl::SleepFor(absl::Seconds(2));
+
+  // Verify that we have a connection that can be accepted even though no
+  // data was written.
+  auto accepted =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     All, SocketInetLoopbackTest,
     ::testing::Values(
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 33a5ac66c..525ccbd88 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1286,6 +1286,59 @@ TEST_P(SimpleTcpSocketTest, SetTCPUserTimeout) {
   EXPECT_EQ(get, kTCPUserTimeout);
 }
 
+TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptNeg) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  // A negative TCP_DEFER_ACCEPT is the same as setting it to zero.
+  constexpr int kNeg = -1;
+  EXPECT_THAT(
+      setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &kNeg, sizeof(kNeg)),
+      SyscallSucceeds());
+  int get = -1;  // Sentinel; must be overwritten by the getsockopt below.
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);
+}
+
+TEST_P(SimpleTcpSocketTest, GetTCPDeferAcceptDefault) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  int get = -1;  // Sentinel; must be overwritten with the default of 0.
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 0);
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptGreaterThanZero) {
+  FileDescriptor s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  // kTCPDeferAccept is in seconds.
+  // NOTE: Linux translates seconds to a number of retries and back from
+  //   a number of retries to seconds, which means only certain values
+  //   translate back exactly. That's why we use 3 here; a value of
+  //   5 would result in us getting back 7 instead of 5 from the
+  //   getsockopt.
+  constexpr int kTCPDeferAccept = 3;
+  ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+                         &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+              SyscallSucceeds());
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
+      SyscallSucceeds());
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kTCPDeferAccept);
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From ede8dfab3760afc8063c3418f217e52f7ec70d42 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Thu, 30 Jan 2020 09:13:36 -0800
Subject: Enforce splice offset limits

Splice must not allow negative offsets. Writes also must not allow offset +
size to overflow int64. Reads are similarly broken, but not just in splice
(b/148095030).

Reported-by: syzbot+0e1ff0b95fb2859b4190@syzkaller.appspotmail.com
PiperOrigin-RevId: 292361208
---
 pkg/sentry/fs/tmpfs/inode_file.go       | 10 ++++--
 pkg/sentry/syscalls/linux/sys_splice.go | 16 ++++------
 test/syscalls/linux/splice.cc           | 56 +++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+), 12 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index dabc10662..25abbc151 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -17,6 +17,7 @@ package tmpfs
 import (
 	"fmt"
 	"io"
+	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -444,10 +445,15 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error)
 	defer rw.f.dataMu.Unlock()
 
 	// Compute the range to write.
-	end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes()))
-	if end == rw.offset { // srcs.NumBytes() == 0?
+	if srcs.NumBytes() == 0 {
+		// Nothing to do.
 		return 0, nil
 	}
+	end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes()))
+	if end == math.MaxInt64 {
+		// Overflow.
+		return 0, syserror.EINVAL
+	}
 
 	// Check if seals prevent either file growth or all writes.
 	switch {
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index f43d6c155..fd642834b 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -25,6 +25,10 @@ import (
 
 // doSplice implements a blocking splice operation.
 func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) {
+	if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 {
+		return 0, syserror.EINVAL
+	}
+
 	var (
 		total int64
 		n     int64
@@ -82,11 +86,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	offsetAddr := args[2].Pointer()
 	count := int64(args[3].SizeT())
 
-	// Don't send a negative number of bytes.
-	if count < 0 {
-		return 0, nil, syserror.EINVAL
-	}
-
 	// Get files.
 	inFile := t.GetFile(inFD)
 	if inFile == nil {
@@ -136,11 +135,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 			return 0, nil, err
 		}
 
-		// The offset must be valid.
-		if offset < 0 {
-			return 0, nil, syserror.EINVAL
-		}
-
 		// Do the splice.
 		n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{
 			Length:    count,
@@ -227,6 +221,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			if _, err := t.CopyIn(outOffset, &offset); err != nil {
 				return 0, nil, err
 			}
+
 			// Use the destination offset.
 			opts.DstOffset = true
 			opts.DstStart = offset
@@ -244,6 +239,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 			if _, err := t.CopyIn(inOffset, &offset); err != nil {
 				return 0, nil, err
 			}
+
 			// Use the source offset.
 			opts.SrcOffset = true
 			opts.SrcStart = offset
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index 85232cb1f..faa1247f6 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -60,6 +60,62 @@ TEST(SpliceTest, TwoRegularFiles) {
               SyscallFailsWithErrno(EINVAL));
 }
 
+int memfd_create(const std::string& name, unsigned int flags) {
+  return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+TEST(SpliceTest, NegativeOffset) {
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Fill the pipe.
+  std::vector<char> buf(kPageSize);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(kPageSize));
+
+  // Open the output file (memfd descriptors are read-write).
+  int fd;
+  EXPECT_THAT(fd = memfd_create("negative", 0), SyscallSucceeds());
+  const FileDescriptor out_fd(fd);
+
+  loff_t out_offset = 0xffffffffffffffffull;
+  constexpr int kSize = 2;
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kSize, 0),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+// Write offset + size overflows int64.
+//
+// This is a regression test for b/148041624.
+TEST(SpliceTest, WriteOverflow) {
+  // Create a new pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Fill the pipe.
+  std::vector<char> buf(kPageSize);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(kPageSize));
+
+  // Open the output file.
+  int fd;
+  EXPECT_THAT(fd = memfd_create("overflow", 0), SyscallSucceeds());
+  const FileDescriptor out_fd(fd);
+
+  // out_offset + kSize overflows INT64_MAX.
+  loff_t out_offset = 0x7ffffffffffffffeull;
+  constexpr int kSize = 3;
+  EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kSize, 0),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(SpliceTest, SamePipe) {
   // Create a new pipe.
   int fds[2];
-- 
cgit v1.2.3


From 4ee64a248ec16fcc9e526a457a66648546611bfb Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 30 Jan 2020 11:48:36 -0800
Subject: Fix for panic in endpoint.Close().

When sending a RST on shutdown we need to double check the
state after acquiring the work mutex as the endpoint could
have transitioned out of a connected state between the time
we checked it and the time we acquired the workMutex.

I added two tests but sadly neither reproduces the panic. I am
going to leave the tests in as they are good to have anyway.

PiperOrigin-RevId: 292393800
---
 pkg/tcpip/transport/tcp/BUILD                |  1 +
 pkg/tcpip/transport/tcp/endpoint.go          | 10 ++++-
 pkg/tcpip/transport/tcp/tcp_test.go          | 55 ++++++++++++++++++++++++++++
 test/syscalls/linux/BUILD                    |  1 +
 test/syscalls/linux/socket_ip_tcp_generic.cc | 33 +++++++++++++++++
 5 files changed, 98 insertions(+), 2 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 7b4a87a2d..272e8f570 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -91,6 +91,7 @@ go_test(
     tags = ["flaky"],
     deps = [
         ":tcp",
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/checker",
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 8d52414b7..b5a8e15ee 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2047,8 +2047,14 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 				// work mutex is available.
 				if e.workMu.TryLock() {
 					e.mu.Lock()
-					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
-					e.notifyProtocolGoroutine(notifyTickleWorker)
+					// We need to double check here to make
+					// sure worker has not transitioned the
+					// endpoint out of a connected state
+					// before trying to send a reset.
+					if e.EndpointState().connected() {
+						e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+						e.notifyProtocolGoroutine(notifyTickleWorker)
+					}
 					e.mu.Unlock()
 					e.workMu.Unlock()
 				} else {
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index a12336d47..2c1505067 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -21,6 +21,7 @@ import (
 	"testing"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/checker"
@@ -6913,3 +6914,57 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
 		checker.SeqNum(uint32(iss+1)),
 		checker.AckNum(uint32(irs+5))))
 }
+
+func TestResetDuringClose(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	iss := seqnum.Value(789)
+	c.CreateConnected(iss, 30000, -1 /* epRecvBuf */)
+	// Send some data to make sure there is some unread
+	// data to trigger a reset on c.Close.
+	irs := c.IRS
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss.Add(1),
+		AckNum:  irs.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(irs.Add(1))),
+		checker.AckNum(uint32(iss.Add(5)))))
+
+	// Close in a separate goroutine so that we can trigger
+	// a race with the RST we send below. This should not
+	// panic due to the route being released depending on
+	// whether Close() sends an active RST or the RST sent
+	// below is processed by the worker first.
+	var wg sync.WaitGroup
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		c.SendPacket(nil, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			SeqNum:  iss.Add(5),
+			AckNum:  c.IRS.Add(5),
+			RcvWnd:  30000,
+			Flags:   header.TCPFlagRst,
+		})
+	}()
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		c.EP.Close()
+	}()
+
+	wg.Wait()
+}
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 74bf068ec..7958fd0d7 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2173,6 +2173,7 @@ cc_library(
         ":socket_test_util",
         "//test/util:test_util",
         "//test/util:thread_util",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
     ],
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 57ce8e169..27779e47c 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -24,6 +24,7 @@
 #include <sys/un.h>
 
 #include "gtest/gtest.h"
+#include "absl/memory/memory.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/syscalls/linux/socket_test_util.h"
@@ -875,5 +876,37 @@ TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) {
   EXPECT_EQ(get, kAbove);
 }
 
+TEST_P(TCPSocketPairTest, TCPResetDuringClose_NoRandomSave) {
+  DisableSave ds;  // Too many syscalls.
+  constexpr int kThreadCount = 1000;
+  std::unique_ptr<ScopedThread> instances[kThreadCount];
+  for (int i = 0; i < kThreadCount; i++) {
+    instances[i] = absl::make_unique<ScopedThread>([&]() {
+      auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+      ScopedThread t([&]() {
+        // Close one end to trigger sending of a FIN.
+        struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0};
+        // Wait up to 20 seconds for the data.
+        constexpr int kPollTimeoutMs = 20000;
+        ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+                    SyscallSucceedsWithValue(1));
+        ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+      });
+
+      // Send some data then close.
+      constexpr char kStr[] = "abc";
+      ASSERT_THAT(write(sockets->first_fd(), kStr, 3),
+                  SyscallSucceedsWithValue(3));
+      absl::SleepFor(absl::Milliseconds(10));
+      ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+      t.Join();
+    });
+  }
+  for (int i = 0; i < kThreadCount; i++) {
+    instances[i]->Join();
+  }
+}
+
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 9988cf2eeff596ce519046d80c54d09166f7d84b Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Thu, 30 Jan 2020 14:06:54 -0800
Subject: Wrap all GetSocketPairs() in unnamed namespaces

This avoids conflicting definitions of GetSocketPairs() in outer namespace when
multiple such cc files are compiled for one binary.

PiperOrigin-RevId: 292420885
---
 test/syscalls/linux/socket_abstract.cc                       | 2 ++
 test/syscalls/linux/socket_filesystem.cc                     | 2 ++
 test/syscalls/linux/socket_ip_tcp_generic_loopback.cc        | 2 ++
 test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc       | 2 ++
 test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc       | 2 ++
 test/syscalls/linux/socket_ip_udp_loopback.cc                | 2 ++
 test/syscalls/linux/socket_ip_udp_loopback_blocking.cc       | 2 ++
 test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc       | 2 ++
 test/syscalls/linux/socket_unix_abstract_nonblock.cc         | 2 ++
 test/syscalls/linux/socket_unix_blocking_local.cc            | 2 ++
 test/syscalls/linux/socket_unix_dgram_local.cc               | 2 ++
 test/syscalls/linux/socket_unix_domain.cc                    | 2 ++
 test/syscalls/linux/socket_unix_filesystem_nonblock.cc       | 2 ++
 test/syscalls/linux/socket_unix_non_stream_blocking_local.cc | 2 ++
 test/syscalls/linux/socket_unix_pair.cc                      | 2 ++
 test/syscalls/linux/socket_unix_pair_nonblock.cc             | 2 ++
 test/syscalls/linux/socket_unix_seqpacket_local.cc           | 2 ++
 test/syscalls/linux/socket_unix_stream_blocking_local.cc     | 2 ++
 test/syscalls/linux/socket_unix_stream_local.cc              | 2 ++
 test/syscalls/linux/socket_unix_stream_nonblock_local.cc     | 2 ++
 20 files changed, 40 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_abstract.cc b/test/syscalls/linux/socket_abstract.cc
index 715d87b76..00999f192 100644
--- a/test/syscalls/linux/socket_abstract.cc
+++ b/test/syscalls/linux/socket_abstract.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -43,5 +44,6 @@ INSTANTIATE_TEST_SUITE_P(
     AbstractUnixSockets, UnixSocketPairCmsgTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_filesystem.cc b/test/syscalls/linux/socket_filesystem.cc
index 74e262959..287359363 100644
--- a/test/syscalls/linux/socket_filesystem.cc
+++ b/test/syscalls/linux/socket_filesystem.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -43,5 +44,6 @@ INSTANTIATE_TEST_SUITE_P(
     FilesystemUnixSockets, UnixSocketPairCmsgTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
index d11f7cc23..4e79d21f4 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVecToVec<SocketPairKind>(
@@ -39,5 +40,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllTCPSockets, TCPSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
index fcd20102f..f996b93d2 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVecToVec<SocketPairKind>(
@@ -39,5 +40,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingTCPSockets, BlockingStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
index 63a05b799..ffa377210 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVecToVec<SocketPairKind>(
@@ -38,5 +39,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingTCPSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_loopback.cc b/test/syscalls/linux/socket_ip_udp_loopback.cc
index 1df74a348..c7fa44884 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -44,5 +45,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllUDPSockets, UDPSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
index 1e259efa7..d6925a8df 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingUDPSockets, BlockingNonStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
index 74cbd326d..d675eddc6 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingUDPSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_abstract_nonblock.cc b/test/syscalls/linux/socket_unix_abstract_nonblock.cc
index be31ab2a7..8bef76b67 100644
--- a/test/syscalls/linux/socket_unix_abstract_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_abstract_nonblock.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingAbstractUnixSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_blocking_local.cc b/test/syscalls/linux/socket_unix_blocking_local.cc
index 6f84221b2..77cb8c6d6 100644
--- a/test/syscalls/linux/socket_unix_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_blocking_local.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(
@@ -39,5 +40,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingUnixDomainSockets, BlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_dgram_local.cc b/test/syscalls/linux/socket_unix_dgram_local.cc
index 9134fcdf7..31d2d5216 100644
--- a/test/syscalls/linux/socket_unix_dgram_local.cc
+++ b/test/syscalls/linux/socket_unix_dgram_local.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(VecCat<SocketPairKind>(
@@ -52,5 +53,6 @@ INSTANTIATE_TEST_SUITE_P(
     DgramUnixSockets, NonStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_domain.cc b/test/syscalls/linux/socket_unix_domain.cc
index fa3efc7f8..f7dff8b4d 100644
--- a/test/syscalls/linux/socket_unix_domain.cc
+++ b/test/syscalls/linux/socket_unix_domain.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllUnixDomainSockets, AllSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
index 8ba7af971..6700b4d90 100644
--- a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingFilesystemUnixSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
index 8855d5001..fddcdf1c5 100644
--- a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(
@@ -36,5 +37,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingNonStreamUnixSockets, BlockingNonStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_pair.cc b/test/syscalls/linux/socket_unix_pair.cc
index 411fb4518..85999db04 100644
--- a/test/syscalls/linux/socket_unix_pair.cc
+++ b/test/syscalls/linux/socket_unix_pair.cc
@@ -22,6 +22,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(ApplyVec<SocketPairKind>(
@@ -38,5 +39,6 @@ INSTANTIATE_TEST_SUITE_P(
     AllUnixDomainSockets, UnixSocketPairCmsgTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_pair_nonblock.cc b/test/syscalls/linux/socket_unix_pair_nonblock.cc
index 3135d325f..281410a9a 100644
--- a/test/syscalls/linux/socket_unix_pair_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_pair_nonblock.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return ApplyVec<SocketPairKind>(
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingUnixSockets, NonBlockingSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_seqpacket_local.cc b/test/syscalls/linux/socket_unix_seqpacket_local.cc
index dff75a532..69a5f150d 100644
--- a/test/syscalls/linux/socket_unix_seqpacket_local.cc
+++ b/test/syscalls/linux/socket_unix_seqpacket_local.cc
@@ -23,6 +23,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(VecCat<SocketPairKind>(
@@ -52,5 +53,6 @@ INSTANTIATE_TEST_SUITE_P(
     SeqpacketUnixSockets, UnixNonStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
index 08e579ba7..8429bd429 100644
--- a/test/syscalls/linux/socket_unix_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -34,5 +35,6 @@ INSTANTIATE_TEST_SUITE_P(
     BlockingStreamUnixSockets, BlockingStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_stream_local.cc b/test/syscalls/linux/socket_unix_stream_local.cc
index 65eef1a81..a7e3449a9 100644
--- a/test/syscalls/linux/socket_unix_stream_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_local.cc
@@ -21,6 +21,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return VecCat<SocketPairKind>(
@@ -42,5 +43,6 @@ INSTANTIATE_TEST_SUITE_P(
     StreamUnixSockets, StreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
index 1936aa135..4b763c8e2 100644
--- a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
@@ -20,6 +20,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketPairKind> GetSocketPairs() {
   return {
@@ -33,5 +34,6 @@ INSTANTIATE_TEST_SUITE_P(
     NonBlockingStreamUnixSockets, NonBlockingStreamSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
 
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 14959250feb71df74dea13f3cb15dcbe8ce6b3f3 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 30 Jan 2020 17:37:17 -0800
Subject: Simplify testing link rules.

PiperOrigin-RevId: 292458933
---
 test/syscalls/linux/BUILD | 688 +++++++++++++++++++++++-----------------------
 test/util/BUILD           |  30 +-
 tools/build/defs.bzl      |   1 +
 tools/defs.bzl            |   3 +-
 tools/images/BUILD        |   4 +-
 5 files changed, 363 insertions(+), 363 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index ee7a8a673..e4ca5b6db 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_binary", "cc_library", "default_net_util", "select_arch", "select_system")
+load("//tools:defs.bzl", "cc_binary", "cc_library", "default_net_util", "gtest", "select_arch", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -82,14 +82,14 @@ cc_library(
     srcs = ["base_poll_test.cc"],
     hdrs = ["base_poll_test.h"],
     deps = [
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -99,11 +99,11 @@ cc_library(
     hdrs = ["file_base.h"],
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -130,7 +130,7 @@ cc_library(
     hdrs = ["socket_test_util.h"],
     defines = select_system(),
     deps = default_net_util() + [
-        "@com_google_googletest//:gtest",
+        gtest,
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -155,9 +155,9 @@ cc_library(
     hdrs = ["unix_domain_socket_test_util.h"],
     deps = [
         ":socket_test_util",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
     ],
 )
 
@@ -179,14 +179,14 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -199,13 +199,13 @@ cc_binary(
     ),
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:memory_util",
         "//test/util:platform_util",
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -218,9 +218,9 @@ cc_binary(
         ":socket_test_util",
         ":unix_domain_socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -233,9 +233,9 @@ cc_binary(
         ":socket_test_util",
         ":unix_domain_socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -247,10 +247,10 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -262,12 +262,12 @@ cc_binary(
     deps = [
         "//test/util:cleanup",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -280,12 +280,11 @@ cc_binary(
     ],
     linkstatic = 1,
     deps = [
-        # The heapchecker doesn't recognize that io_destroy munmaps.
-        "@com_google_googletest//:gtest",
-        "@com_google_absl//absl/strings",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:memory_util",
         "//test/util:posix_error",
         "//test/util:proc_util",
@@ -302,12 +301,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -320,9 +319,9 @@ cc_binary(
         "//:sandbox",
     ],
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -334,9 +333,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -348,9 +347,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -372,10 +371,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -388,10 +387,10 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -404,14 +403,14 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/synchronization",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -424,12 +423,12 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/flags:flag",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -443,12 +442,12 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:mount_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -458,9 +457,9 @@ cc_binary(
     srcs = ["clock_getres.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -470,11 +469,11 @@ cc_binary(
     srcs = ["clock_gettime.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -484,13 +483,13 @@ cc_binary(
     srcs = ["concurrency.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:platform_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -503,9 +502,9 @@ cc_binary(
         ":socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -516,10 +515,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -530,9 +529,9 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -544,11 +543,11 @@ cc_binary(
     deps = [
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -561,10 +560,10 @@ cc_binary(
         "//test/util:epoll_util",
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -576,10 +575,10 @@ cc_binary(
     deps = [
         "//test/util:epoll_util",
         "//test/util:eventfd_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -589,12 +588,12 @@ cc_binary(
     srcs = ["exceptions.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:logging",
         "//test/util:platform_util",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -604,10 +603,10 @@ cc_binary(
     srcs = ["getcpu.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -617,10 +616,10 @@ cc_binary(
     srcs = ["getcpu.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -630,13 +629,13 @@ cc_binary(
     srcs = ["getrusage.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -652,14 +651,14 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:proc_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -682,15 +681,15 @@ cc_binary(
     deps = [
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/types:optional",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/types:optional",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -701,11 +700,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -718,10 +717,10 @@ cc_binary(
         ":file_base",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -731,9 +730,9 @@ cc_binary(
     srcs = ["fault.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -744,10 +743,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -761,18 +760,18 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:eventfd_util",
         "//test/util:fs_util",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:save_util",
         "//test/util:temp_path",
         "//test/util:test_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -786,15 +785,15 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -805,13 +804,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -824,11 +823,11 @@ cc_binary(
     ),
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:logging",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -841,10 +840,10 @@ cc_binary(
     ),
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -855,10 +854,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -869,10 +868,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -884,6 +883,9 @@ cc_binary(
     deps = [
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:memory_util",
         "//test/util:save_util",
         "//test/util:temp_path",
@@ -892,9 +894,6 @@ cc_binary(
         "//test/util:thread_util",
         "//test/util:time_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -907,12 +906,12 @@ cc_binary(
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -922,9 +921,9 @@ cc_binary(
     srcs = ["getrandom.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -957,10 +956,10 @@ cc_binary(
         ":socket_test_util",
         ":unix_domain_socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -984,9 +983,9 @@ cc_binary(
         ":socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -997,6 +996,9 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
@@ -1004,9 +1006,6 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1018,15 +1017,15 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1039,14 +1038,14 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1057,10 +1056,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1071,6 +1070,7 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
@@ -1078,7 +1078,6 @@ cc_binary(
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1089,12 +1088,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        "@com_google_absl//absl/memory",
+        gtest,
         "//test/util:memory_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1104,11 +1103,11 @@ cc_binary(
     srcs = ["mincore.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:memory_util",
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1121,10 +1120,10 @@ cc_binary(
         ":temp_umask",
         "//test/util:capability_util",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1135,11 +1134,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1151,12 +1150,12 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:cleanup",
+        gtest,
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
         "//test/util:rlimit_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1169,13 +1168,13 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1188,6 +1187,9 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:mount_util",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
@@ -1195,9 +1197,6 @@ cc_binary(
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1207,10 +1206,9 @@ cc_binary(
     srcs = ["mremap.cc"],
     linkstatic = 1,
     deps = [
-        # The heap check fails due to MremapDeathTest
-        "@com_google_googletest//:gtest",
-        "@com_google_absl//absl/strings",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
@@ -1242,9 +1240,9 @@ cc_binary(
     srcs = ["munmap.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1261,14 +1259,14 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1282,10 +1280,10 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1299,11 +1297,11 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:endian",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -1317,11 +1315,11 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:endian",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -1333,16 +1331,16 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:pty_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1354,12 +1352,12 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:posix_error",
         "//test/util:pty_util",
         "//test/util:test_main",
         "//test/util:thread_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1372,12 +1370,12 @@ cc_binary(
         "//test/syscalls/linux:socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1387,13 +1385,13 @@ cc_binary(
     srcs = ["pause.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1405,15 +1403,15 @@ cc_binary(
     deps = [
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1426,13 +1424,13 @@ cc_binary(
         ":base_poll_test",
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1443,11 +1441,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         ":base_poll_test",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1458,9 +1456,9 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1472,12 +1470,12 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:cleanup",
+        "@com_google_absl//absl/flags:flag",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1488,13 +1486,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        "@com_google_absl//absl/flags:flag",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1505,10 +1503,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1519,6 +1517,8 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:temp_path",
@@ -1526,8 +1526,6 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1541,13 +1539,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1559,11 +1557,11 @@ cc_binary(
     deps = [
         "//test/util:capability_util",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1577,6 +1575,10 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:memory_util",
         "//test/util:posix_error",
         "//test/util:temp_path",
@@ -1584,10 +1586,6 @@ cc_binary(
         "//test/util:thread_util",
         "//test/util:time_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1601,11 +1599,11 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -1617,17 +1615,17 @@ cc_binary(
     deps = [
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:optional",
+        gtest,
         "//test/util:memory_util",
         "//test/util:posix_error",
         "//test/util:proc_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/types:optional",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1641,6 +1639,8 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
@@ -1648,8 +1648,6 @@ cc_binary(
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1660,11 +1658,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         ":base_poll_test",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1674,6 +1672,9 @@ cc_binary(
     srcs = ["ptrace.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:platform_util",
@@ -1681,9 +1682,6 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1693,10 +1691,10 @@ cc_binary(
     srcs = ["pwrite64.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1710,12 +1708,12 @@ cc_binary(
     deps = [
         ":file_base",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1729,11 +1727,11 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:endian",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -1747,10 +1745,10 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1764,10 +1762,10 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1778,10 +1776,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1792,10 +1790,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1811,13 +1809,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1832,12 +1830,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1851,11 +1849,11 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1879,11 +1877,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/syscalls/linux/rseq:lib",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1894,11 +1892,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        gtest,
         "//test/util:logging",
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1908,9 +1906,9 @@ cc_binary(
     srcs = ["sched.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1920,9 +1918,9 @@ cc_binary(
     srcs = ["sched_yield.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1932,6 +1930,8 @@ cc_binary(
     srcs = ["seccomp.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/base:core_headers",
+        gtest,
         "//test/util:logging",
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
@@ -1939,8 +1939,6 @@ cc_binary(
         "//test/util:proc_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1952,14 +1950,14 @@ cc_binary(
     deps = [
         ":base_poll_test",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:rlimit_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1971,13 +1969,13 @@ cc_binary(
     deps = [
         "//test/util:eventfd_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -1989,12 +1987,12 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2005,13 +2003,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2021,9 +2019,9 @@ cc_binary(
     srcs = ["sigaction.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2038,13 +2036,13 @@ cc_binary(
     deps = [
         "//test/util:cleanup",
         "//test/util:fs_util",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2057,7 +2055,7 @@ cc_binary(
     ),
     linkstatic = 1,
     deps = [
-        "@com_google_googletest//:gtest",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_util",
@@ -2075,14 +2073,14 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/synchronization",
+        gtest,
         "//test/util:logging",
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2092,10 +2090,10 @@ cc_binary(
     srcs = ["sigprocmask.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2105,13 +2103,13 @@ cc_binary(
     srcs = ["sigstop.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2122,13 +2120,13 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2144,10 +2142,10 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
     ],
     alwayslink = 1,
 )
@@ -2160,8 +2158,8 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2174,8 +2172,8 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2191,11 +2189,11 @@ cc_library(
     ],
     deps = [
         ":socket_test_util",
-        "//test/util:test_util",
-        "//test/util:thread_util",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
+        "//test/util:thread_util",
     ],
     alwayslink = 1,
 )
@@ -2212,8 +2210,8 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2230,9 +2228,9 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:memory_util",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2250,8 +2248,8 @@ cc_library(
         ":ip_socket_test_util",
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2268,8 +2266,8 @@ cc_library(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2286,9 +2284,9 @@ cc_library(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
-        "//test/util:test_util",
         "@com_google_absl//absl/memory",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
     ],
     alwayslink = 1,
 )
@@ -2305,8 +2303,8 @@ cc_library(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2323,8 +2321,8 @@ cc_library(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2387,9 +2385,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2419,9 +2417,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2451,9 +2449,9 @@ cc_binary(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2551,10 +2549,10 @@ cc_binary(
         ":socket_bind_to_device_util",
         ":socket_test_util",
         "//test/util:capability_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2570,10 +2568,10 @@ cc_binary(
         ":socket_bind_to_device_util",
         ":socket_test_util",
         "//test/util:capability_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2589,10 +2587,10 @@ cc_binary(
         ":socket_bind_to_device_util",
         ":socket_test_util",
         "//test/util:capability_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2638,9 +2636,9 @@ cc_binary(
     deps = [
         ":ip_socket_test_util",
         ":socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2719,15 +2717,15 @@ cc_binary(
         ":ip_socket_test_util",
         ":socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:save_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2739,9 +2737,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2755,10 +2753,10 @@ cc_binary(
         ":socket_test_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings:str_format",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2771,9 +2769,9 @@ cc_binary(
         ":socket_netlink_util",
         ":socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -2791,9 +2789,9 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
-        "//test/util:test_util",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_util",
     ],
     alwayslink = 1,
 )
@@ -2810,11 +2808,11 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2831,10 +2829,10 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2851,10 +2849,10 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2871,11 +2869,11 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:timer_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2892,8 +2890,8 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -2910,10 +2908,10 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -3007,9 +3005,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3021,9 +3019,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3035,9 +3033,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3052,9 +3050,9 @@ cc_binary(
         ":socket_blocking_test_cases",
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3069,9 +3067,9 @@ cc_binary(
         ":ip_socket_test_util",
         ":socket_blocking_test_cases",
         ":socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3086,9 +3084,9 @@ cc_binary(
         ":socket_non_stream_blocking_test_cases",
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3103,9 +3101,9 @@ cc_binary(
         ":ip_socket_test_util",
         ":socket_non_stream_blocking_test_cases",
         ":socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3121,9 +3119,9 @@ cc_binary(
         ":socket_unix_cmsg_test_cases",
         ":socket_unix_test_cases",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3135,9 +3133,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3149,9 +3147,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3164,10 +3162,10 @@ cc_binary(
         ":socket_netlink_util",
         ":socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/base:endian",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/base:endian",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3183,12 +3181,12 @@ cc_binary(
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3199,11 +3197,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3217,12 +3215,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3235,10 +3233,10 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3248,10 +3246,10 @@ cc_binary(
     srcs = ["sync.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3261,10 +3259,10 @@ cc_binary(
     srcs = ["sysinfo.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3274,9 +3272,9 @@ cc_binary(
     srcs = ["syslog.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3286,10 +3284,10 @@ cc_binary(
     srcs = ["sysret.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:logging",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3301,12 +3299,12 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3316,11 +3314,11 @@ cc_binary(
     srcs = ["tgkill.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:signal_util",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3330,10 +3328,10 @@ cc_binary(
     srcs = ["time.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:proc_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3358,15 +3356,15 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
         "//test/util:signal_util",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3376,11 +3374,11 @@ cc_binary(
     srcs = ["tkill.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:logging",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3394,11 +3392,11 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3414,12 +3412,12 @@ cc_library(
     deps = [
         ":socket_test_util",
         ":unix_domain_socket_test_util",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
     alwayslink = 1,
 )
@@ -3442,9 +3440,9 @@ cc_binary(
     deps = [
         ":socket_test_util",
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3455,14 +3453,14 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:uid_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3473,11 +3471,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3490,11 +3488,11 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3504,11 +3502,11 @@ cc_binary(
     srcs = ["unshare.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/synchronization",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3534,11 +3532,11 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:fs_util",
+        gtest,
         "//test/util:posix_error",
         "//test/util:proc_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3548,13 +3546,13 @@ cc_binary(
     srcs = ["vfork.cc"],
     linkstatic = 1,
     deps = [
+        "@com_google_absl//absl/flags:flag",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:test_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3566,6 +3564,10 @@ cc_binary(
     deps = [
         "//test/util:cleanup",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        gtest,
         "//test/util:logging",
         "//test/util:multiprocess_util",
         "//test/util:posix_error",
@@ -3574,10 +3576,6 @@ cc_binary(
         "//test/util:test_util",
         "//test/util:thread_util",
         "//test/util:time_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3588,10 +3586,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3602,12 +3600,12 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        gtest,
         "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3618,14 +3616,14 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:capability_util",
-        "//test/util:test_main",
-        "//test/util:test_util",
-        "//test/util:thread_util",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
     ],
 )
 
@@ -3651,10 +3649,10 @@ cc_binary(
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
+        gtest,
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3664,11 +3662,11 @@ cc_binary(
     srcs = ["vdso_clock_gettime.cc"],
     linkstatic = 1,
     deps = [
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -3678,10 +3676,10 @@ cc_binary(
     srcs = ["vsyscall.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:proc_util",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3694,11 +3692,11 @@ cc_binary(
         ":unix_domain_socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
-        "//test/util:test_main",
-        "//test/util:test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
     ],
 )
 
@@ -3710,12 +3708,12 @@ cc_binary(
     deps = [
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        gtest,
         "//test/util:memory_util",
         "//test/util:multiprocess_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3727,10 +3725,10 @@ cc_binary(
     deps = [
         ":ip_socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3742,10 +3740,10 @@ cc_binary(
     deps = [
         ":ip_socket_test_util",
         "//test/util:file_descriptor",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -3761,11 +3759,11 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/strings",
+        gtest,
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
     ],
 )
diff --git a/test/util/BUILD b/test/util/BUILD
index 1ac8b3fd6..1f22ebe29 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_library", "cc_test", "select_system")
+load("//tools:defs.bzl", "cc_library", "cc_test", "gtest", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -41,7 +41,7 @@ cc_library(
         ":save_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -55,7 +55,7 @@ cc_library(
         ":posix_error",
         ":test_util",
         "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -67,7 +67,7 @@ cc_test(
         ":proc_util",
         ":test_main",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -87,7 +87,7 @@ cc_library(
         ":file_descriptor",
         ":posix_error",
         "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -101,7 +101,7 @@ cc_test(
         ":temp_path",
         ":test_main",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -134,7 +134,7 @@ cc_library(
         ":cleanup",
         ":posix_error",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -183,7 +183,7 @@ cc_library(
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:variant",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -194,7 +194,7 @@ cc_test(
     deps = [
         ":posix_error",
         ":test_main",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -218,7 +218,7 @@ cc_library(
         ":cleanup",
         ":posix_error",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -233,7 +233,7 @@ cc_library(
         ":test_util",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -259,7 +259,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -291,7 +291,7 @@ cc_library(
         ":posix_error",
         ":test_util",
         "@com_google_absl//absl/time",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -302,7 +302,7 @@ cc_test(
     deps = [
         ":test_main",
         ":test_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
@@ -322,7 +322,7 @@ cc_library(
         ":file_descriptor",
         ":posix_error",
         ":save_util",
-        "@com_google_googletest//:gtest",
+        gtest,
     ],
 )
 
diff --git a/tools/build/defs.bzl b/tools/build/defs.bzl
index d0556abd1..967c1f900 100644
--- a/tools/build/defs.bzl
+++ b/tools/build/defs.bzl
@@ -18,6 +18,7 @@ cc_test = _cc_test
 cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain"
 go_image = _go_image
 go_embed_data = _go_embed_data
+gtest = "@com_google_googletest//:gtest"
 loopback = "//tools/build:loopback"
 proto_library = native.proto_library
 pkg_deb = _pkg_deb
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 819f12b0d..ce677cbbf 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,7 +7,7 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/build:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/build:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
 
 # Delegate directly.
 cc_binary = _cc_binary
@@ -20,6 +20,7 @@ go_embed_data = _go_embed_data
 go_image = _go_image
 go_test = _go_test
 go_tool_library = _go_tool_library
+gtest = _gtest
 pkg_deb = _pkg_deb
 pkg_tar = _pkg_tar
 py_library = _py_library
diff --git a/tools/images/BUILD b/tools/images/BUILD
index f1699b184..fe11f08a3 100644
--- a/tools/images/BUILD
+++ b/tools/images/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_binary")
+load("//tools:defs.bzl", "cc_binary", "gtest")
 load("//tools/images:defs.bzl", "vm_image", "vm_test")
 
 package(
@@ -32,8 +32,8 @@ cc_binary(
     srcs = ["test.cc"],
     linkstatic = 1,
     deps = [
+        gtest,
         "//test/util:test_main",
-        "@com_google_googletest//:gtest",
     ],
 )
 
-- 
cgit v1.2.3


From 04cccaaeeed22a28a42fc4c1406b43a966a5d886 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 31 Jan 2020 14:44:50 -0800
Subject: Fix logic around AMD/Intel cases.

If the support is Ignored, then the call is still executed. We
simply rely on it to fall through to the int3. Therefore, we
must also bail on the vendor check.

PiperOrigin-RevId: 292620558
---
 test/syscalls/linux/32bit.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index 2751fb4e7..9883aef61 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -102,7 +102,8 @@ TEST(Syscall32Bit, Int80) {
 }
 
 TEST(Syscall32Bit, Sysenter) {
-  if (PlatformSupport32Bit() == PlatformSupport::Allowed &&
+  if ((PlatformSupport32Bit() == PlatformSupport::Allowed ||
+       PlatformSupport32Bit() == PlatformSupport::Ignored) &&
       GetCPUVendor() == CPUVendor::kAMD) {
     // SYSENTER is an illegal instruction in compatibility mode on AMD.
     EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
@@ -133,7 +134,8 @@ TEST(Syscall32Bit, Sysenter) {
 }
 
 TEST(Syscall32Bit, Syscall) {
-  if (PlatformSupport32Bit() == PlatformSupport::Allowed &&
+  if ((PlatformSupport32Bit() == PlatformSupport::Allowed ||
+       PlatformSupport32Bit() == PlatformSupport::Ignored) &&
       GetCPUVendor() == CPUVendor::kIntel) {
     // SYSCALL is an illegal instruction in compatibility mode on Intel.
     EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
-- 
cgit v1.2.3


From e7846e50f2df070a15dd33235b334e2223f715f3 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Mon, 3 Feb 2020 15:30:28 -0800
Subject: Reduce run time for
 //test/syscalls:socket_inet_loopback_test_runsc_ptrace.

* Tests are picked for a shard differently. It now picks one test from each
  block, instead of picking the whole block. This spreads tests of the same
  kind across different shards.

* Reduce the number of connect() calls in TCPListenClose.

PiperOrigin-RevId: 293019281
---
 runsc/testutil/testutil.go                  | 61 ++++++++++++++---------------
 test/runtimes/runner.go                     |  9 ++---
 test/syscalls/linux/socket_inet_loopback.cc | 13 +++---
 test/syscalls/syscall_test_runner.go        |  7 ++--
 4 files changed, 43 insertions(+), 47 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index fb22eae39..5d0b0ae54 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -434,43 +434,40 @@ func IsStatic(filename string) (bool, error) {
 	return true, nil
 }
 
-// TestBoundsForShard calculates the beginning and end indices for the test
-// based on the TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars. The
-// returned ints are the beginning (inclusive) and end (exclusive) of the
-// subslice corresponding to the shard. If either of the env vars are not
-// present, then the function will return bounds that include all tests. If
-// there are more shards than there are tests, then the returned list may be
-// empty.
-func TestBoundsForShard(numTests int) (int, int, error) {
+// TestIndicesForShard returns indices for this test shard based on the
+// TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars.
+//
+// If either of the env vars are not present, then the function will return all
+// tests. If there are more shards than there are tests, then the returned list
+// may be empty.
+func TestIndicesForShard(numTests int) ([]int, error) {
 	var (
-		begin = 0
-		end   = numTests
+		shardIndex = 0
+		shardTotal = 1
 	)
-	indexStr, totalStr := os.Getenv("TEST_SHARD_INDEX"), os.Getenv("TEST_TOTAL_SHARDS")
-	if indexStr == "" || totalStr == "" {
-		return begin, end, nil
-	}
 
-	// Parse index and total to ints.
-	shardIndex, err := strconv.Atoi(indexStr)
-	if err != nil {
-		return 0, 0, fmt.Errorf("invalid TEST_SHARD_INDEX %q: %v", indexStr, err)
-	}
-	shardTotal, err := strconv.Atoi(totalStr)
-	if err != nil {
-		return 0, 0, fmt.Errorf("invalid TEST_TOTAL_SHARDS %q: %v", totalStr, err)
+	indexStr, totalStr := os.Getenv("TEST_SHARD_INDEX"), os.Getenv("TEST_TOTAL_SHARDS")
+	if indexStr != "" && totalStr != "" {
+		// Parse index and total to ints.
+		var err error
+		shardIndex, err = strconv.Atoi(indexStr)
+		if err != nil {
+			return nil, fmt.Errorf("invalid TEST_SHARD_INDEX %q: %v", indexStr, err)
+		}
+		shardTotal, err = strconv.Atoi(totalStr)
+		if err != nil {
+			return nil, fmt.Errorf("invalid TEST_TOTAL_SHARDS %q: %v", totalStr, err)
+		}
 	}
 
 	// Calculate!
-	shardSize := int(math.Ceil(float64(numTests) / float64(shardTotal)))
-	begin = shardIndex * shardSize
-	end = ((shardIndex + 1) * shardSize)
-	if begin > numTests {
-		// Nothing to run.
-		return 0, 0, nil
-	}
-	if end > numTests {
-		end = numTests
+	var indices []int
+	numBlocks := int(math.Ceil(float64(numTests) / float64(shardTotal)))
+	for i := 0; i < numBlocks; i++ {
+		pick := i*shardTotal + shardIndex
+		if pick < numTests {
+			indices = append(indices, pick)
+		}
 	}
-	return begin, end, nil
+	return indices, nil
 }
diff --git a/test/runtimes/runner.go b/test/runtimes/runner.go
index bec37c69d..ddb890dbc 100644
--- a/test/runtimes/runner.go
+++ b/test/runtimes/runner.go
@@ -20,7 +20,6 @@ import (
 	"flag"
 	"fmt"
 	"io"
-	"log"
 	"os"
 	"sort"
 	"strings"
@@ -101,17 +100,15 @@ func getTests(d dockerutil.Docker, blacklist map[string]struct{}) ([]testing.Int
 	// shard.
 	tests := strings.Fields(list)
 	sort.Strings(tests)
-	begin, end, err := testutil.TestBoundsForShard(len(tests))
+	indices, err := testutil.TestIndicesForShard(len(tests))
 	if err != nil {
 		return nil, fmt.Errorf("TestsForShard() failed: %v", err)
 	}
-	log.Printf("Got bounds [%d:%d) for shard out of %d total tests", begin, end, len(tests))
-	tests = tests[begin:end]
 
 	var itests []testing.InternalTest
-	for _, tc := range tests {
+	for _, tci := range indices {
 		// Capture tc in this scope.
-		tc := tc
+		tc := tests[tci]
 		itests = append(itests, testing.InternalTest{
 			Name: tc,
 			F: func(t *testing.T) {
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 3bf7081b9..b24618a88 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -325,6 +325,12 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   TestAddress const& listener = param.listener;
   TestAddress const& connector = param.connector;
 
+  constexpr int kAcceptCount = 32;
+  constexpr int kBacklog = kAcceptCount * 2;
+  constexpr int kFDs = 128;
+  constexpr int kThreadCount = 4;
+  constexpr int kFDsPerThread = kFDs / kThreadCount;
+
   // Create the listening socket.
   FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
       Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
@@ -332,7 +338,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
                    listener.addr_len),
               SyscallSucceeds());
-  ASSERT_THAT(listen(listen_fd.get(), 1001), SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
 
   // Get the port bound by the listening socket.
   socklen_t addrlen = listener.addr_len;
@@ -345,9 +351,6 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   DisableSave ds;  // Too many system calls.
   sockaddr_storage conn_addr = connector.addr;
   ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
-  constexpr int kFDs = 2048;
-  constexpr int kThreadCount = 4;
-  constexpr int kFDsPerThread = kFDs / kThreadCount;
   FileDescriptor clients[kFDs];
   std::unique_ptr<ScopedThread> threads[kThreadCount];
   for (int i = 0; i < kFDs; i++) {
@@ -371,7 +374,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   for (int i = 0; i < kThreadCount; i++) {
     threads[i]->Join();
   }
-  for (int i = 0; i < 32; i++) {
+  for (int i = 0; i < kAcceptCount; i++) {
     auto accepted =
         ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
   }
diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
index b9fd885ff..ae342b68c 100644
--- a/test/syscalls/syscall_test_runner.go
+++ b/test/syscalls/syscall_test_runner.go
@@ -450,17 +450,16 @@ func main() {
 	}
 
 	// Get subset of tests corresponding to shard.
-	begin, end, err := testutil.TestBoundsForShard(len(testCases))
+	indices, err := testutil.TestIndicesForShard(len(testCases))
 	if err != nil {
 		fatalf("TestsForShard() failed: %v", err)
 	}
-	testCases = testCases[begin:end]
 
 	// Run the tests.
 	var tests []testing.InternalTest
-	for _, tc := range testCases {
+	for _, tci := range indices {
 		// Capture tc.
-		tc := tc
+		tc := testCases[tci]
 		testName := fmt.Sprintf("%s_%s", tc.Suite, tc.Name)
 		tests = append(tests, testing.InternalTest{
 			Name: testName,
-- 
cgit v1.2.3


From c5d4041623ac6405135e966af6d06c178a86870d Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Tue, 4 Feb 2020 12:53:10 -0800
Subject: Include socket_ip_udp_loopback.cc in exports_files

So it can be included in fuchsia's syscall tests

PiperOrigin-RevId: 293208306
---
 test/syscalls/linux/BUILD | 1 +
 1 file changed, 1 insertion(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index e4ca5b6db..737e2329f 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -11,6 +11,7 @@ exports_files(
         "socket_inet_loopback.cc",
         "socket_ip_loopback_blocking.cc",
         "socket_ip_tcp_loopback.cc",
+        "socket_ip_udp_loopback.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
         "tcp_socket.cc",
         "udp_socket.cc",
-- 
cgit v1.2.3


From 6823b5e244a5748032130574ae3a25a0a36bbbf5 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 4 Feb 2020 13:05:30 -0800
Subject: timer_create(2) should return 0 on success

The timer ID is copied out to the argument.

Fixes #1738

PiperOrigin-RevId: 293210801
---
 pkg/sentry/syscalls/linux/sys_timer.go |  2 +-
 test/syscalls/linux/timers.cc          | 18 +++++++++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go
index 432351917..a4c400f87 100644
--- a/pkg/sentry/syscalls/linux/sys_timer.go
+++ b/pkg/sentry/syscalls/linux/sys_timer.go
@@ -146,7 +146,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 		return 0, nil, err
 	}
 
-	return uintptr(id), nil, nil
+	return 0, nil, nil
 }
 
 // TimerSettime implements linux syscall timer_settime(2).
diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc
index 3db18d7ac..2f92c27da 100644
--- a/test/syscalls/linux/timers.cc
+++ b/test/syscalls/linux/timers.cc
@@ -297,9 +297,13 @@ class IntervalTimer {
 PosixErrorOr<IntervalTimer> TimerCreate(clockid_t clockid,
                                         const struct sigevent& sev) {
   int timerid;
-  if (syscall(SYS_timer_create, clockid, &sev, &timerid) < 0) {
+  int ret = syscall(SYS_timer_create, clockid, &sev, &timerid);
+  if (ret < 0) {
     return PosixError(errno, "timer_create");
   }
+  if (ret > 0) {
+    return PosixError(EINVAL, "timer_create should never return positive");
+  }
   MaybeSave();
   return IntervalTimer(timerid);
 }
@@ -317,6 +321,18 @@ TEST(IntervalTimerTest, IsInitiallyStopped) {
   EXPECT_EQ(0, its.it_value.tv_nsec);
 }
 
+// Kernel can create multiple timers without issue.
+//
+// Regression test for gvisor.dev/issue/1738.
+TEST(IntervalTimerTest, MultipleTimers) {
+  struct sigevent sev = {};
+  sev.sigev_notify = SIGEV_NONE;
+  const auto timer1 =
+      ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+  const auto timer2 =
+      ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+}
+
 TEST(IntervalTimerTest, SingleShotSilent) {
   struct sigevent sev = {};
   sev.sigev_notify = SIGEV_NONE;
-- 
cgit v1.2.3


From a26a954946ad2e7910d3ad7578960a93b73a1f9b Mon Sep 17 00:00:00 2001
From: Ian Gudger <igudger@google.com>
Date: Tue, 4 Feb 2020 15:20:30 -0800
Subject: Add socket connection stress test.

Tests 65k connection attempts on common types of sockets to check for port
leaks.

Also fixes a bug where dual-stack sockets wouldn't properly re-queue
segments received while closing.

PiperOrigin-RevId: 293241166
---
 pkg/tcpip/transport/tcp/connect.go           |   4 ++
 test/syscalls/BUILD                          |   9 +++
 test/syscalls/linux/BUILD                    |  17 +++++
 test/syscalls/linux/ip_socket_test_util.cc   |  27 +++++++
 test/syscalls/linux/ip_socket_test_util.h    |  15 ++++
 test/syscalls/linux/socket_generic_stress.cc |  83 ++++++++++++++++++++++
 test/syscalls/linux/socket_test_util.cc      | 101 ++++++++++++++++++++++++---
 test/syscalls/linux/socket_test_util.h       |   6 ++
 8 files changed, 251 insertions(+), 11 deletions(-)
 create mode 100644 test/syscalls/linux/socket_generic_stress.cc

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 9ff7ac261..5c5397823 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -989,6 +989,10 @@ func (e *endpoint) transitionToStateCloseLocked() {
 // to any other listening endpoint. We reply with RST if we cannot find one.
 func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 	ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.ID, &s.route)
+	if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.EndpointInfo.TransportEndpointInfo.ID.LocalAddress.To4() != "" {
+		// Dual-stack socket, try IPv4.
+		ep = e.stack.FindTransportEndpoint(header.IPv4ProtocolNumber, e.TransProto, e.ID, &s.route)
+	}
 	if ep == nil {
 		replyWithReset(s)
 		s.decRef()
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 8f2b75a1c..31d239c0e 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -45,6 +45,15 @@ syscall_test(test = "//test/syscalls/linux:brk_test")
 
 syscall_test(test = "//test/syscalls/linux:socket_test")
 
+syscall_test(
+    size = "large",
+    shard_count = 50,
+    # Takes too long for TSAN. Since this is kind of a stress test that doesn't
+    # involve much concurrency, TSAN's usefulness here is limited anyway.
+    tags = ["nogotsan"],
+    test = "//test/syscalls/linux:socket_stress_test",
+)
+
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:chdir_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 737e2329f..273b014d6 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -136,6 +136,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
+        "@com_google_absl//absl/types:optional",
         "//test/util:file_descriptor",
         "//test/util:posix_error",
         "//test/util:temp_path",
@@ -2151,6 +2152,22 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_binary(
+    name = "socket_stress_test",
+    testonly = 1,
+    srcs = [
+        "socket_generic_stress.cc",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_test_util",
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
 cc_library(
     name = "socket_unix_dgram_test_cases",
     testonly = 1,
diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index 6b472eb2f..bba022a41 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -79,6 +79,33 @@ SocketPairKind DualStackTCPAcceptBindSocketPair(int type) {
                                      /* dual_stack = */ true)};
 }
 
+SocketPairKind IPv6TCPAcceptBindPersistentListenerSocketPair(int type) {
+  std::string description =
+      absl::StrCat(DescribeSocketType(type), "connected IPv6 TCP socket");
+  return SocketPairKind{description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP,
+                        TCPAcceptBindPersistentListenerSocketPairCreator(
+                            AF_INET6, type | SOCK_STREAM, 0,
+                            /* dual_stack = */ false)};
+}
+
+SocketPairKind IPv4TCPAcceptBindPersistentListenerSocketPair(int type) {
+  std::string description =
+      absl::StrCat(DescribeSocketType(type), "connected IPv4 TCP socket");
+  return SocketPairKind{description, AF_INET, type | SOCK_STREAM, IPPROTO_TCP,
+                        TCPAcceptBindPersistentListenerSocketPairCreator(
+                            AF_INET, type | SOCK_STREAM, 0,
+                            /* dual_stack = */ false)};
+}
+
+SocketPairKind DualStackTCPAcceptBindPersistentListenerSocketPair(int type) {
+  std::string description =
+      absl::StrCat(DescribeSocketType(type), "connected dual stack TCP socket");
+  return SocketPairKind{description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP,
+                        TCPAcceptBindPersistentListenerSocketPairCreator(
+                            AF_INET6, type | SOCK_STREAM, 0,
+                            /* dual_stack = */ true)};
+}
+
 SocketPairKind IPv6UDPBidirectionalBindSocketPair(int type) {
   std::string description =
       absl::StrCat(DescribeSocketType(type), "connected IPv6 UDP socket");
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 0f58e0f77..083ebbcf0 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -50,6 +50,21 @@ SocketPairKind IPv4TCPAcceptBindSocketPair(int type);
 // given type bound to the IPv4 loopback.
 SocketPairKind DualStackTCPAcceptBindSocketPair(int type);
 
+// IPv6TCPAcceptBindPersistentListenerSocketPair is like
+// IPv6TCPAcceptBindSocketPair except it uses a persistent listening socket to
+// create all socket pairs.
+SocketPairKind IPv6TCPAcceptBindPersistentListenerSocketPair(int type);
+
+// IPv4TCPAcceptBindPersistentListenerSocketPair is like
+// IPv4TCPAcceptBindSocketPair except it uses a persistent listening socket to
+// create all socket pairs.
+SocketPairKind IPv4TCPAcceptBindPersistentListenerSocketPair(int type);
+
+// DualStackTCPAcceptBindPersistentListenerSocketPair is like
+// DualStackTCPAcceptBindSocketPair except it uses a persistent listening socket
+// to create all socket pairs.
+SocketPairKind DualStackTCPAcceptBindPersistentListenerSocketPair(int type);
+
 // IPv6UDPBidirectionalBindSocketPair returns a SocketPairKind that represents
 // SocketPairs created with bind() and connect() syscalls with AF_INET6 and the
 // given type bound to the IPv6 loopback.
diff --git a/test/syscalls/linux/socket_generic_stress.cc b/test/syscalls/linux/socket_generic_stress.cc
new file mode 100644
index 000000000..6a232238d
--- /dev/null
+++ b/test/syscalls/linux/socket_generic_stress.cc
@@ -0,0 +1,83 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected sockets.
+using ConnectStressTest = SocketPairTest;
+
+TEST_P(ConnectStressTest, Reset65kTimes) {
+  for (int i = 0; i < 1 << 16; ++i) {
+    auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+    // Send some data to ensure that the connection gets reset and the port gets
+    // released immediately. This avoids either end entering TIME-WAIT.
+    char sent_data[100] = {};
+    ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)),
+                SyscallSucceedsWithValue(sizeof(sent_data)));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllConnectedSockets, ConnectStressTest,
+    ::testing::Values(IPv6UDPBidirectionalBindSocketPair(0),
+                      IPv4UDPBidirectionalBindSocketPair(0),
+                      DualStackUDPBidirectionalBindSocketPair(0),
+
+                      // Without REUSEADDR, we get port exhaustion on Linux.
+                      SetSockOpt(SOL_SOCKET, SO_REUSEADDR,
+                                 &kSockOptOn)(IPv6TCPAcceptBindSocketPair(0)),
+                      SetSockOpt(SOL_SOCKET, SO_REUSEADDR,
+                                 &kSockOptOn)(IPv4TCPAcceptBindSocketPair(0)),
+                      SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+                          DualStackTCPAcceptBindSocketPair(0))));
+
+// Test fixture for tests that apply to pairs of connected sockets created with
+// a persistent listener (if applicable).
+using PersistentListenerConnectStressTest = SocketPairTest;
+
+TEST_P(PersistentListenerConnectStressTest, 65kTimes) {
+  for (int i = 0; i < 1 << 16; ++i) {
+    auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllConnectedSockets, PersistentListenerConnectStressTest,
+    ::testing::Values(
+        IPv6UDPBidirectionalBindSocketPair(0),
+        IPv4UDPBidirectionalBindSocketPair(0),
+        DualStackUDPBidirectionalBindSocketPair(0),
+
+        // Without REUSEADDR, we get port exhaustion on Linux.
+        SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+            IPv6TCPAcceptBindPersistentListenerSocketPair(0)),
+        SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+            IPv4TCPAcceptBindPersistentListenerSocketPair(0)),
+        SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+            DualStackTCPAcceptBindPersistentListenerSocketPair(0))));
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
index eff7d577e..c0c5ab3fe 100644
--- a/test/syscalls/linux/socket_test_util.cc
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -18,10 +18,13 @@
 #include <poll.h>
 #include <sys/socket.h>
 
+#include <memory>
+
 #include "gtest/gtest.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/time/clock.h"
+#include "absl/types/optional.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
@@ -109,7 +112,10 @@ Creator<SocketPair> AcceptBindSocketPairCreator(bool abstract, int domain,
       MaybeSave();  // Unlinked path.
     }
 
-    return absl::make_unique<AddrFDSocketPair>(connected, accepted, bind_addr,
+    // accepted is before connected to destruct connected before accepted.
+    // Destructors for nonstatic member objects are called in the reverse order
+    // in which they appear in the class declaration.
+    return absl::make_unique<AddrFDSocketPair>(accepted, connected, bind_addr,
                                                extra_addr);
   };
 }
@@ -311,11 +317,16 @@ PosixErrorOr<T> BindIP(int fd, bool dual_stack) {
 }
 
 template <typename T>
-PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateTCPAcceptBindSocketPair(
-    int bound, int connected, int type, bool dual_stack) {
-  ASSIGN_OR_RETURN_ERRNO(T bind_addr, BindIP<T>(bound, dual_stack));
-  RETURN_ERROR_IF_SYSCALL_FAIL(listen(bound, /* backlog = */ 5));
+PosixErrorOr<T> TCPBindAndListen(int fd, bool dual_stack) {
+  ASSIGN_OR_RETURN_ERRNO(T addr, BindIP<T>(fd, dual_stack));
+  RETURN_ERROR_IF_SYSCALL_FAIL(listen(fd, /* backlog = */ 5));
+  return addr;
+}
 
+template <typename T>
+PosixErrorOr<std::unique_ptr<AddrFDSocketPair>>
+CreateTCPConnectAcceptSocketPair(int bound, int connected, int type,
+                                 bool dual_stack, T bind_addr) {
   int connect_result = 0;
   RETURN_ERROR_IF_SYSCALL_FAIL(
       (connect_result = RetryEINTR(connect)(
@@ -358,16 +369,27 @@ PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateTCPAcceptBindSocketPair(
     absl::SleepFor(absl::Seconds(1));
   }
 
-  // Cleanup no longer needed resources.
-  RETURN_ERROR_IF_SYSCALL_FAIL(close(bound));
-  MaybeSave();  // Successful close.
-
   T extra_addr = {};
   LocalhostAddr(&extra_addr, dual_stack);
   return absl::make_unique<AddrFDSocketPair>(connected, accepted, bind_addr,
                                              extra_addr);
 }
 
+template <typename T>
+PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateTCPAcceptBindSocketPair(
+    int bound, int connected, int type, bool dual_stack) {
+  ASSIGN_OR_RETURN_ERRNO(T bind_addr, TCPBindAndListen<T>(bound, dual_stack));
+
+  auto result = CreateTCPConnectAcceptSocketPair(bound, connected, type,
+                                                 dual_stack, bind_addr);
+
+  // Cleanup no longer needed resources.
+  RETURN_ERROR_IF_SYSCALL_FAIL(close(bound));
+  MaybeSave();  // Successful close.
+
+  return result;
+}
+
 Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
                                                    int protocol,
                                                    bool dual_stack) {
@@ -389,6 +411,63 @@ Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
   };
 }
 
+Creator<SocketPair> TCPAcceptBindPersistentListenerSocketPairCreator(
+    int domain, int type, int protocol, bool dual_stack) {
+  // These are lazily initialized below, on the first call to the returned
+  // lambda. These values are private to each returned lambda, but shared across
+  // invocations of a specific lambda.
+  //
+  // The sharing allows pairs created with the same parameters to share a
+  // listener. This prevents future connects from failing if the connecting
+  // socket selects a port which had previously been used by a listening socket
+  // that still has some connections in TIME-WAIT.
+  //
+  // The lazy initialization is to avoid creating sockets during parameter
+  // enumeration. This is important because parameters are enumerated during the
+  // build process where networking may not be available.
+  auto listener = std::make_shared<absl::optional<int>>(absl::optional<int>());
+  auto addr4 = std::make_shared<absl::optional<sockaddr_in>>(
+      absl::optional<sockaddr_in>());
+  auto addr6 = std::make_shared<absl::optional<sockaddr_in6>>(
+      absl::optional<sockaddr_in6>());
+
+  return [=]() -> PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> {
+    int connected;
+    RETURN_ERROR_IF_SYSCALL_FAIL(connected = socket(domain, type, protocol));
+    MaybeSave();  // Successful socket creation.
+
+    // Share the listener across invocations.
+    if (!listener->has_value()) {
+      int fd = socket(domain, type, protocol);
+      if (fd < 0) {
+        return PosixError(errno, absl::StrCat("socket(", domain, ", ", type,
+                                              ", ", protocol, ")"));
+      }
+      listener->emplace(fd);
+      MaybeSave();  // Successful socket creation.
+    }
+
+    // Bind the listener once, but create a new connect/accept pair each
+    // time.
+    if (domain == AF_INET) {
+      if (!addr4->has_value()) {
+        addr4->emplace(
+            TCPBindAndListen<sockaddr_in>(listener->value(), dual_stack)
+                .ValueOrDie());
+      }
+      return CreateTCPConnectAcceptSocketPair(listener->value(), connected,
+                                              type, dual_stack, addr4->value());
+    }
+    if (!addr6->has_value()) {
+      addr6->emplace(
+          TCPBindAndListen<sockaddr_in6>(listener->value(), dual_stack)
+              .ValueOrDie());
+    }
+    return CreateTCPConnectAcceptSocketPair(listener->value(), connected, type,
+                                            dual_stack, addr6->value());
+  };
+}
+
 template <typename T>
 PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateUDPBoundSocketPair(
     int sock1, int sock2, int type, bool dual_stack) {
@@ -518,8 +597,8 @@ size_t CalculateUnixSockAddrLen(const char* sun_path) {
   if (sun_path[0] == 0) {
     return sizeof(sockaddr_un);
   }
-  // Filesystem addresses use the address length plus the 2 byte sun_family and
-  // null terminator.
+  // Filesystem addresses use the address length plus the 2 byte sun_family
+  // and null terminator.
   return strlen(sun_path) + 3;
 }
 
diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h
index 2dbb8bed3..bfaa6e397 100644
--- a/test/syscalls/linux/socket_test_util.h
+++ b/test/syscalls/linux/socket_test_util.h
@@ -273,6 +273,12 @@ Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
                                                    int protocol,
                                                    bool dual_stack);
 
+// TCPAcceptBindPersistentListenerSocketPairCreator is like
+// TCPAcceptBindSocketPairCreator, except it uses the same listening socket to
+// create all SocketPairs.
+Creator<SocketPair> TCPAcceptBindPersistentListenerSocketPairCreator(
+    int domain, int type, int protocol, bool dual_stack);
+
 // UDPBidirectionalBindSocketPairCreator returns a Creator<SocketPair> that
 // obtains file descriptors by invoking the bind() and connect() syscalls on UDP
 // sockets.
-- 
cgit v1.2.3


From 665b614e4a6e715bac25bea15c5c29184016e549 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Tue, 4 Feb 2020 18:04:26 -0800
Subject: Support RTM_NEWADDR and RTM_GETLINK in (rt)netlink.

PiperOrigin-RevId: 293271055
---
 pkg/sentry/inet/inet.go                      |   4 +
 pkg/sentry/inet/test_stack.go                |   6 +
 pkg/sentry/socket/hostinet/stack.go          |   5 +
 pkg/sentry/socket/netlink/BUILD              |  14 +-
 pkg/sentry/socket/netlink/message.go         | 129 +++++++++++
 pkg/sentry/socket/netlink/message_test.go    | 312 +++++++++++++++++++++++++++
 pkg/sentry/socket/netlink/provider.go        |   2 +-
 pkg/sentry/socket/netlink/route/BUILD        |   2 -
 pkg/sentry/socket/netlink/route/protocol.go  | 238 ++++++++++++++------
 pkg/sentry/socket/netlink/socket.go          |  54 ++---
 pkg/sentry/socket/netlink/uevent/protocol.go |   2 +-
 pkg/sentry/socket/netstack/stack.go          |  55 +++++
 pkg/tcpip/stack/stack.go                     |   9 +
 test/syscalls/linux/BUILD                    |   2 +
 test/syscalls/linux/socket_netlink_route.cc  | 296 ++++++++++++++++++++-----
 test/syscalls/linux/socket_netlink_util.cc   |  45 +++-
 test/syscalls/linux/socket_netlink_util.h    |   9 +
 17 files changed, 1022 insertions(+), 162 deletions(-)
 create mode 100644 pkg/sentry/socket/netlink/message_test.go

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index a7dfb78a7..2916a0644 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -28,6 +28,10 @@ type Stack interface {
 	// interface indexes to a slice of associated interface address properties.
 	InterfaceAddrs() map[int32][]InterfaceAddr
 
+	// AddInterfaceAddr adds an address to the network interface identified by
+	// index.
+	AddInterfaceAddr(idx int32, addr InterfaceAddr) error
+
 	// SupportsIPv6 returns true if the stack supports IPv6 connectivity.
 	SupportsIPv6() bool
 
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index dcfcbd97e..d8961fc94 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -47,6 +47,12 @@ func (s *TestStack) InterfaceAddrs() map[int32][]InterfaceAddr {
 	return s.InterfaceAddrsMap
 }
 
+// AddInterfaceAddr implements Stack.AddInterfaceAddr.
+func (s *TestStack) AddInterfaceAddr(idx int32, addr InterfaceAddr) error {
+	s.InterfaceAddrsMap[idx] = append(s.InterfaceAddrsMap[idx], addr)
+	return nil
+}
+
 // SupportsIPv6 implements Stack.SupportsIPv6.
 func (s *TestStack) SupportsIPv6() bool {
 	return s.SupportsIPv6Flag
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index 034eca676..a48082631 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -310,6 +310,11 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
 	return addrs
 }
 
+// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
+func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+	return syserror.EACCES
+}
+
 // SupportsIPv6 implements inet.Stack.SupportsIPv6.
 func (s *Stack) SupportsIPv6() bool {
 	return s.supportsIPv6
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index f8b8e467d..1911cd9b8 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -33,3 +33,15 @@ go_library(
         "//pkg/waiter",
     ],
 )
+
+go_test(
+    name = "netlink_test",
+    size = "small",
+    srcs = [
+        "message_test.go",
+    ],
+    deps = [
+        ":netlink",
+        "//pkg/abi/linux",
+    ],
+)
diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go
index b21e0ca4b..4ea252ccb 100644
--- a/pkg/sentry/socket/netlink/message.go
+++ b/pkg/sentry/socket/netlink/message.go
@@ -30,8 +30,16 @@ func alignUp(length int, align uint) int {
 	return (length + int(align) - 1) &^ (int(align) - 1)
 }
 
+// alignPad returns the length of padding required for alignment.
+//
+// Preconditions: align is a power of two.
+func alignPad(length int, align uint) int {
+	return alignUp(length, align) - length
+}
+
 // Message contains a complete serialized netlink message.
 type Message struct {
+	hdr linux.NetlinkMessageHeader
 	buf []byte
 }
 
@@ -40,10 +48,86 @@ type Message struct {
 // The header length will be updated by Finalize.
 func NewMessage(hdr linux.NetlinkMessageHeader) *Message {
 	return &Message{
+		hdr: hdr,
 		buf: binary.Marshal(nil, usermem.ByteOrder, hdr),
 	}
 }
 
+// ParseMessage parses the first message in buf and returns the rest of the
+// buffer. If the message is malformed, ok is false. For the last message the
+// padding check is loose: if there isn't enough padding, the whole buf is
+// consumed and ok is set to true.
+func ParseMessage(buf []byte) (msg *Message, rest []byte, ok bool) {
+	b := BytesView(buf)
+
+	hdrBytes, ok := b.Extract(linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return
+	}
+	var hdr linux.NetlinkMessageHeader
+	binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr)
+
+	// Msg portion.
+	totalMsgLen := int(hdr.Length)
+	_, ok = b.Extract(totalMsgLen - linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return
+	}
+
+	// Padding.
+	numPad := alignPad(totalMsgLen, linux.NLMSG_ALIGNTO)
+	// Linux permits the last message not being aligned, just consume all of it.
+	// Ref: net/netlink/af_netlink.c:netlink_rcv_skb
+	if numPad > len(b) {
+		numPad = len(b)
+	}
+	_, ok = b.Extract(numPad)
+	if !ok {
+		return
+	}
+
+	return &Message{
+		hdr: hdr,
+		buf: buf[:totalMsgLen],
+	}, []byte(b), true
+}
+
+// Header returns the header of this message.
+func (m *Message) Header() linux.NetlinkMessageHeader {
+	return m.hdr
+}
+
+// GetData unmarshals the payload message header from this netlink message, and
+// returns the attributes portion.
+func (m *Message) GetData(msg interface{}) (AttrsView, bool) {
+	b := BytesView(m.buf)
+
+	_, ok := b.Extract(linux.NetlinkMessageHeaderSize)
+	if !ok {
+		return nil, false
+	}
+
+	size := int(binary.Size(msg))
+	msgBytes, ok := b.Extract(size)
+	if !ok {
+		return nil, false
+	}
+	binary.Unmarshal(msgBytes, usermem.ByteOrder, msg)
+
+	numPad := alignPad(linux.NetlinkMessageHeaderSize+size, linux.NLMSG_ALIGNTO)
+	// Linux permits the last message not being aligned, just consume all of it.
+	// Ref: net/netlink/af_netlink.c:netlink_rcv_skb
+	if numPad > len(b) {
+		numPad = len(b)
+	}
+	_, ok = b.Extract(numPad)
+	if !ok {
+		return nil, false
+	}
+
+	return AttrsView(b), true
+}
+
 // Finalize returns the []byte containing the entire message, with the total
 // length set in the message header. The Message must not be modified after
 // calling Finalize.
@@ -157,3 +241,48 @@ func (ms *MessageSet) AddMessage(hdr linux.NetlinkMessageHeader) *Message {
 	ms.Messages = append(ms.Messages, m)
 	return m
 }
+
+// AttrsView is a view into the attributes portion of a netlink message.
+type AttrsView []byte
+
+// Empty returns whether there is no attribute left in v.
+func (v AttrsView) Empty() bool {
+	return len(v) == 0
+}
+
+// ParseFirst parses the first netlink attribute at the beginning of v.
+func (v AttrsView) ParseFirst() (hdr linux.NetlinkAttrHeader, value []byte, rest AttrsView, ok bool) {
+	b := BytesView(v)
+
+	hdrBytes, ok := b.Extract(linux.NetlinkAttrHeaderSize)
+	if !ok {
+		return
+	}
+	binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr)
+
+	value, ok = b.Extract(int(hdr.Length) - linux.NetlinkAttrHeaderSize)
+	if !ok {
+		return
+	}
+
+	_, ok = b.Extract(alignPad(int(hdr.Length), linux.NLA_ALIGNTO))
+	if !ok {
+		return
+	}
+
+	return hdr, value, AttrsView(b), ok
+}
+
+// BytesView supports extracting data from a byte slice with bounds checking.
+type BytesView []byte
+
+// Extract removes the first n bytes from v and returns it. If n is out of
+// bounds, it returns false.
+func (v *BytesView) Extract(n int) ([]byte, bool) {
+	if n < 0 || n > len(*v) {
+		return nil, false
+	}
+	extracted := (*v)[:n]
+	*v = (*v)[n:]
+	return extracted, true
+}
diff --git a/pkg/sentry/socket/netlink/message_test.go b/pkg/sentry/socket/netlink/message_test.go
new file mode 100644
index 000000000..ef13d9386
--- /dev/null
+++ b/pkg/sentry/socket/netlink/message_test.go
@@ -0,0 +1,312 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package message_test
+
+import (
+	"bytes"
+	"reflect"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
+)
+
+type dummyNetlinkMsg struct {
+	Foo uint16
+}
+
+func TestParseMessage(t *testing.T) {
+	tests := []struct {
+		desc  string
+		input []byte
+
+		header  linux.NetlinkMessageHeader
+		dataMsg *dummyNetlinkMsg
+		restLen int
+		ok      bool
+	}{
+		{
+			desc: "valid",
+			input: []byte{
+				0x14, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 20,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "valid with next message",
+			input: []byte{
+				0x14, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+				0xFF, // Next message (rest)
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 20,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 1,
+			ok:      true,
+		},
+		{
+			desc: "valid for last message without padding",
+			input: []byte{
+				0x12, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, // Data message
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 18,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "valid for last message not to be aligned",
+			input: []byte{
+				0x13, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, // Data message
+				0x00, // Excessive 1 byte permitted at end
+			},
+			header: linux.NetlinkMessageHeader{
+				Length: 19,
+				Type:   1,
+				Flags:  2,
+				Seq:    3,
+				PortID: 4,
+			},
+			dataMsg: &dummyNetlinkMsg{
+				Foo: 0x3130,
+			},
+			restLen: 0,
+			ok:      true,
+		},
+		{
+			desc: "header.Length too short",
+			input: []byte{
+				0x04, 0x00, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			ok: false,
+		},
+		{
+			desc: "header.Length too long",
+			input: []byte{
+				0xFF, 0xFF, 0x00, 0x00, // Length
+				0x01, 0x00, // Type
+				0x02, 0x00, // Flags
+				0x03, 0x00, 0x00, 0x00, // Seq
+				0x04, 0x00, 0x00, 0x00, // PortID
+				0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding
+			},
+			ok: false,
+		},
+		{
+			desc: "header incomplete",
+			input: []byte{
+				0x04, 0x00, 0x00, 0x00, // Length
+			},
+			ok: false,
+		},
+		{
+			desc:  "empty message",
+			input: []byte{},
+			ok:    false,
+		},
+	}
+	for _, test := range tests {
+		msg, rest, ok := netlink.ParseMessage(test.input)
+		if ok != test.ok {
+			t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok)
+			continue
+		}
+		if !test.ok {
+			continue
+		}
+		if !reflect.DeepEqual(msg.Header(), test.header) {
+			t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, msg.Header(), test.header)
+		}
+
+		dataMsg := &dummyNetlinkMsg{}
+		_, dataOk := msg.GetData(dataMsg)
+		if !dataOk {
+			t.Errorf("%v: GetData.ok = %v, want = true", test.desc, dataOk)
+		} else if !reflect.DeepEqual(dataMsg, test.dataMsg) {
+			t.Errorf("%v: GetData.msg = %+v, want = %+v", test.desc, dataMsg, test.dataMsg)
+		}
+
+		if got, want := rest, test.input[len(test.input)-test.restLen:]; !bytes.Equal(got, want) {
+			t.Errorf("%v: got rest = %v, want = %v", test.desc, got, want)
+		}
+	}
+}
+
+func TestAttrView(t *testing.T) {
+	tests := []struct {
+		desc  string
+		input []byte
+
+		// Outputs for ParseFirst.
+		hdr     linux.NetlinkAttrHeader
+		value   []byte
+		restLen int
+		ok      bool
+
+		// Outputs for Empty.
+		isEmpty bool
+	}{
+		{
+			desc: "valid",
+			input: []byte{
+				0x06, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x00, 0x00, // Data with 2 bytes padding
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 6,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31},
+			restLen: 0,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "at alignment",
+			input: []byte{
+				0x08, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 8,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31, 0x32, 0x33},
+			restLen: 0,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "at alignment with rest data",
+			input: []byte{
+				0x08, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+				0xFF, 0xFE, // Rest data
+			},
+			hdr: linux.NetlinkAttrHeader{
+				Length: 8,
+				Type:   1,
+			},
+			value:   []byte{0x30, 0x31, 0x32, 0x33},
+			restLen: 2,
+			ok:      true,
+			isEmpty: false,
+		},
+		{
+			desc: "hdr.Length too long",
+			input: []byte{
+				0xFF, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			ok:      false,
+			isEmpty: false,
+		},
+		{
+			desc: "hdr.Length too short",
+			input: []byte{
+				0x01, 0x00, // Length
+				0x01, 0x00, // Type
+				0x30, 0x31, 0x32, 0x33, // Data
+			},
+			ok:      false,
+			isEmpty: false,
+		},
+		{
+			desc:    "empty",
+			input:   []byte{},
+			ok:      false,
+			isEmpty: true,
+		},
+	}
+	for _, test := range tests {
+		attrs := netlink.AttrsView(test.input)
+
+		// Test ParseFirst().
+		hdr, value, rest, ok := attrs.ParseFirst()
+		if ok != test.ok {
+			t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok)
+		} else if test.ok {
+			if !reflect.DeepEqual(hdr, test.hdr) {
+				t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, hdr, test.hdr)
+			}
+			if !bytes.Equal(value, test.value) {
+				t.Errorf("%v: got value = %v, want = %v", test.desc, value, test.value)
+			}
+			if wantRest := test.input[len(test.input)-test.restLen:]; !bytes.Equal(rest, wantRest) {
+				t.Errorf("%v: got rest = %v, want = %v", test.desc, rest, wantRest)
+			}
+		}
+
+		// Test Empty().
+		if got, want := attrs.Empty(), test.isEmpty; got != want {
+			t.Errorf("%v: got empty = %v, want = %v", test.desc, got, want)
+		}
+	}
+}
diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go
index 07f860a49..b0dc70e5c 100644
--- a/pkg/sentry/socket/netlink/provider.go
+++ b/pkg/sentry/socket/netlink/provider.go
@@ -42,7 +42,7 @@ type Protocol interface {
 	// If err == nil, any messages added to ms will be sent back to the
 	// other end of the socket. Setting ms.Multi will cause an NLMSG_DONE
 	// message to be sent even if ms contains no messages.
-	ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *MessageSet) *syserr.Error
+	ProcessMessage(ctx context.Context, msg *Message, ms *MessageSet) *syserr.Error
 }
 
 // Provider is a function that creates a new Protocol for a specific netlink
diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD
index 622a1eafc..93127398d 100644
--- a/pkg/sentry/socket/netlink/route/BUILD
+++ b/pkg/sentry/socket/netlink/route/BUILD
@@ -10,13 +10,11 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/context",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket/netlink",
         "//pkg/syserr",
-        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index 2b3c7f5b3..c84d8bd7c 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -17,16 +17,15 @@ package route
 
 import (
 	"bytes"
+	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	"gvisor.dev/gvisor/pkg/syserr"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // commandKind describes the operational class of a message type.
@@ -69,13 +68,7 @@ func (p *Protocol) CanSend() bool {
 }
 
 // dumpLinks handles RTM_GETLINK dump requests.
-func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
-	// TODO(b/68878065): Only the dump variant of the types below are
-	// supported.
-	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
-		return syserr.ErrNotSupported
-	}
-
+func (p *Protocol) dumpLinks(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// NLM_F_DUMP + RTM_GETLINK messages are supposed to include an
 	// ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some
 	// userspace applications (including glibc) still include rtgenmsg.
@@ -99,44 +92,105 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader
 		return nil
 	}
 
-	for id, i := range stack.Interfaces() {
-		m := ms.AddMessage(linux.NetlinkMessageHeader{
-			Type: linux.RTM_NEWLINK,
-		})
+	for idx, i := range stack.Interfaces() {
+		addNewLinkMessage(ms, idx, i)
+	}
 
-		m.Put(linux.InterfaceInfoMessage{
-			Family: linux.AF_UNSPEC,
-			Type:   i.DeviceType,
-			Index:  id,
-			Flags:  i.Flags,
-		})
+	return nil
+}
 
-		m.PutAttrString(linux.IFLA_IFNAME, i.Name)
-		m.PutAttr(linux.IFLA_MTU, i.MTU)
+// getLink handles non-dump RTM_GETLINK requests.
+func (p *Protocol) getLink(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	stack := inet.StackFromContext(ctx)
+	if stack == nil {
+		// No network devices.
+		return nil
+	}
 
-		mac := make([]byte, 6)
-		brd := mac
-		if len(i.Addr) > 0 {
-			mac = i.Addr
-			brd = bytes.Repeat([]byte{0xff}, len(i.Addr))
+	// Parse message.
+	var ifi linux.InterfaceInfoMessage
+	attrs, ok := msg.GetData(&ifi)
+	if !ok {
+		return syserr.ErrInvalidArgument
+	}
+
+	// Parse attributes.
+	var byName []byte
+	for !attrs.Empty() {
+		ahdr, value, rest, ok := attrs.ParseFirst()
+		if !ok {
+			return syserr.ErrInvalidArgument
 		}
-		m.PutAttr(linux.IFLA_ADDRESS, mac)
-		m.PutAttr(linux.IFLA_BROADCAST, brd)
+		attrs = rest
 
-		// TODO(gvisor.dev/issue/578): There are many more attributes.
+		switch ahdr.Type {
+		case linux.IFLA_IFNAME:
+			if len(value) < 1 {
+				return syserr.ErrInvalidArgument
+			}
+			byName = value[:len(value)-1]
+
+			// TODO(gvisor.dev/issue/578): Support IFLA_EXT_MASK.
+		}
 	}
 
+	found := false
+	for idx, i := range stack.Interfaces() {
+		switch {
+		case ifi.Index > 0:
+			if idx != ifi.Index {
+				continue
+			}
+		case byName != nil:
+			if string(byName) != i.Name {
+				continue
+			}
+		default:
+			// Criteria not specified.
+			return syserr.ErrInvalidArgument
+		}
+
+		addNewLinkMessage(ms, idx, i)
+		found = true
+		break
+	}
+	if !found {
+		return syserr.ErrNoDevice
+	}
 	return nil
 }
 
-// dumpAddrs handles RTM_GETADDR dump requests.
-func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
-	// TODO(b/68878065): Only the dump variant of the types below are
-	// supported.
-	if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP {
-		return syserr.ErrNotSupported
+// addNewLinkMessage appends an RTM_NEWLINK message for the given interface to
+// the message set.
+func addNewLinkMessage(ms *netlink.MessageSet, idx int32, i inet.Interface) {
+	m := ms.AddMessage(linux.NetlinkMessageHeader{
+		Type: linux.RTM_NEWLINK,
+	})
+
+	m.Put(linux.InterfaceInfoMessage{
+		Family: linux.AF_UNSPEC,
+		Type:   i.DeviceType,
+		Index:  idx,
+		Flags:  i.Flags,
+	})
+
+	m.PutAttrString(linux.IFLA_IFNAME, i.Name)
+	m.PutAttr(linux.IFLA_MTU, i.MTU)
+
+	mac := make([]byte, 6)
+	brd := mac
+	if len(i.Addr) > 0 {
+		mac = i.Addr
+		brd = bytes.Repeat([]byte{0xff}, len(i.Addr))
 	}
+	m.PutAttr(linux.IFLA_ADDRESS, mac)
+	m.PutAttr(linux.IFLA_BROADCAST, brd)
+
+	// TODO(gvisor.dev/issue/578): There are many more attributes.
+}
 
+// dumpAddrs handles RTM_GETADDR dump requests.
+func (p *Protocol) dumpAddrs(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// RTM_GETADDR dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
@@ -168,6 +222,7 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader
 				Index:     uint32(id),
 			})
 
+			m.PutAttr(linux.IFA_LOCAL, []byte(a.Addr))
 			m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr))
 
 			// TODO(gvisor.dev/issue/578): There are many more attributes.
@@ -252,12 +307,12 @@ func fillRoute(routes []inet.Route, addr []byte) (inet.Route, *syserr.Error) {
 }
 
 // parseForDestination parses a message as format of RouteMessage-RtAttr-dst.
-func parseForDestination(data []byte) ([]byte, *syserr.Error) {
+func parseForDestination(msg *netlink.Message) ([]byte, *syserr.Error) {
 	var rtMsg linux.RouteMessage
-	if len(data) < linux.SizeOfRouteMessage {
+	attrs, ok := msg.GetData(&rtMsg)
+	if !ok {
 		return nil, syserr.ErrInvalidArgument
 	}
-	binary.Unmarshal(data[:linux.SizeOfRouteMessage], usermem.ByteOrder, &rtMsg)
 	// iproute2 added the RTM_F_LOOKUP_TABLE flag in version v4.4.0. See
 	// commit bc234301af12. Note we don't check this flag for backward
 	// compatibility.
@@ -265,26 +320,15 @@ func parseForDestination(data []byte) ([]byte, *syserr.Error) {
 		return nil, syserr.ErrNotSupported
 	}
 
-	data = data[linux.SizeOfRouteMessage:]
-
-	// TODO(gvisor.dev/issue/1611): Add generic attribute parsing.
-	var rtAttr linux.RtAttr
-	if len(data) < linux.SizeOfRtAttr {
-		return nil, syserr.ErrInvalidArgument
+	// Expect first attribute is RTA_DST.
+	if hdr, value, _, ok := attrs.ParseFirst(); ok && hdr.Type == linux.RTA_DST {
+		return value, nil
 	}
-	binary.Unmarshal(data[:linux.SizeOfRtAttr], usermem.ByteOrder, &rtAttr)
-	if rtAttr.Type != linux.RTA_DST {
-		return nil, syserr.ErrInvalidArgument
-	}
-
-	if len(data) < int(rtAttr.Len) {
-		return nil, syserr.ErrInvalidArgument
-	}
-	return data[linux.SizeOfRtAttr:rtAttr.Len], nil
+	return nil, syserr.ErrInvalidArgument
 }
 
 // dumpRoutes handles RTM_GETROUTE requests.
-func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) dumpRoutes(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// RTM_GETROUTE dump requests need not contain anything more than the
 	// netlink header and 1 byte protocol family common to all
 	// NETLINK_ROUTE requests.
@@ -295,10 +339,11 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade
 		return nil
 	}
 
+	hdr := msg.Header()
 	routeTables := stack.RouteTable()
 
 	if hdr.Flags == linux.NLM_F_REQUEST {
-		dst, err := parseForDestination(data)
+		dst, err := parseForDestination(msg)
 		if err != nil {
 			return err
 		}
@@ -357,10 +402,55 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade
 	return nil
 }
 
+// newAddr handles RTM_NEWADDR requests.
+func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	stack := inet.StackFromContext(ctx)
+	if stack == nil {
+		// No network stack.
+		return syserr.ErrProtocolNotSupported
+	}
+
+	var ifa linux.InterfaceAddrMessage
+	attrs, ok := msg.GetData(&ifa)
+	if !ok {
+		return syserr.ErrInvalidArgument
+	}
+
+	for !attrs.Empty() {
+		ahdr, value, rest, ok := attrs.ParseFirst()
+		if !ok {
+			return syserr.ErrInvalidArgument
+		}
+		attrs = rest
+
+		switch ahdr.Type {
+		case linux.IFA_LOCAL:
+			err := stack.AddInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{
+				Family:    ifa.Family,
+				PrefixLen: ifa.PrefixLen,
+				Flags:     ifa.Flags,
+				Addr:      value,
+			})
+			if err == syscall.EEXIST {
+				flags := msg.Header().Flags
+				if flags&linux.NLM_F_EXCL != 0 {
+					return syserr.ErrExists
+				}
+			} else if err != nil {
+				return syserr.ErrInvalidArgument
+			}
+		}
+	}
+	return nil
+}
+
 // ProcessMessage implements netlink.Protocol.ProcessMessage.
-func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
+	hdr := msg.Header()
+
 	// All messages start with a 1 byte protocol family.
-	if len(data) < 1 {
+	var family uint8
+	if _, ok := msg.GetData(&family); !ok {
 		// Linux ignores messages missing the protocol family. See
 		// net/core/rtnetlink.c:rtnetlink_rcv_msg.
 		return nil
@@ -374,16 +464,32 @@ func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageH
 		}
 	}
 
-	switch hdr.Type {
-	case linux.RTM_GETLINK:
-		return p.dumpLinks(ctx, hdr, data, ms)
-	case linux.RTM_GETADDR:
-		return p.dumpAddrs(ctx, hdr, data, ms)
-	case linux.RTM_GETROUTE:
-		return p.dumpRoutes(ctx, hdr, data, ms)
-	default:
-		return syserr.ErrNotSupported
+	if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP {
+		// TODO(b/68878065): Only the dump variants of the types below are
+		// supported.
+		switch hdr.Type {
+		case linux.RTM_GETLINK:
+			return p.dumpLinks(ctx, msg, ms)
+		case linux.RTM_GETADDR:
+			return p.dumpAddrs(ctx, msg, ms)
+		case linux.RTM_GETROUTE:
+			return p.dumpRoutes(ctx, msg, ms)
+		default:
+			return syserr.ErrNotSupported
+		}
+	} else if hdr.Flags&linux.NLM_F_REQUEST == linux.NLM_F_REQUEST {
+		switch hdr.Type {
+		case linux.RTM_GETLINK:
+			return p.getLink(ctx, msg, ms)
+		case linux.RTM_GETROUTE:
+			return p.dumpRoutes(ctx, msg, ms)
+		case linux.RTM_NEWADDR:
+			return p.newAddr(ctx, msg, ms)
+		default:
+			return syserr.ErrNotSupported
+		}
 	}
+	return syserr.ErrNotSupported
 }
 
 // init registers the NETLINK_ROUTE provider.
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index c4b95debb..2ca02567d 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -644,47 +644,38 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error
 	return nil
 }
 
-func (s *Socket) dumpErrorMesage(ctx context.Context, hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) *syserr.Error {
+func dumpErrorMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) {
 	m := ms.AddMessage(linux.NetlinkMessageHeader{
 		Type: linux.NLMSG_ERROR,
 	})
-
 	m.Put(linux.NetlinkErrorMessage{
 		Error:  int32(-err.ToLinux().Number()),
 		Header: hdr,
 	})
-	return nil
+}
 
+func dumpAckMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet) {
+	m := ms.AddMessage(linux.NetlinkMessageHeader{
+		Type: linux.NLMSG_ERROR,
+	})
+	m.Put(linux.NetlinkErrorMessage{
+		Error:  0,
+		Header: hdr,
+	})
 }
 
 // processMessages handles each message in buf, passing it to the protocol
 // handler for final handling.
 func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error {
 	for len(buf) > 0 {
-		if len(buf) < linux.NetlinkMessageHeaderSize {
+		msg, rest, ok := ParseMessage(buf)
+		if !ok {
 			// Linux ignores messages that are too short. See
 			// net/netlink/af_netlink.c:netlink_rcv_skb.
 			break
 		}
-
-		var hdr linux.NetlinkMessageHeader
-		binary.Unmarshal(buf[:linux.NetlinkMessageHeaderSize], usermem.ByteOrder, &hdr)
-
-		if hdr.Length < linux.NetlinkMessageHeaderSize || uint64(hdr.Length) > uint64(len(buf)) {
-			// Linux ignores malformed messages. See
-			// net/netlink/af_netlink.c:netlink_rcv_skb.
-			break
-		}
-
-		// Data from this message.
-		data := buf[linux.NetlinkMessageHeaderSize:hdr.Length]
-
-		// Advance to the next message.
-		next := alignUp(int(hdr.Length), linux.NLMSG_ALIGNTO)
-		if next >= len(buf)-1 {
-			next = len(buf) - 1
-		}
-		buf = buf[next:]
+		buf = rest
+		hdr := msg.Header()
 
 		// Ignore control messages.
 		if hdr.Type < linux.NLMSG_MIN_TYPE {
@@ -692,19 +683,10 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error
 		}
 
 		ms := NewMessageSet(s.portID, hdr.Seq)
-		var err *syserr.Error
-		// TODO(b/68877377): ACKs not supported yet.
-		if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
-			err = syserr.ErrNotSupported
-		} else {
-
-			err = s.protocol.ProcessMessage(ctx, hdr, data, ms)
-		}
-		if err != nil {
-			ms = NewMessageSet(s.portID, hdr.Seq)
-			if err := s.dumpErrorMesage(ctx, hdr, ms, err); err != nil {
-				return err
-			}
+		if err := s.protocol.ProcessMessage(ctx, msg, ms); err != nil {
+			dumpErrorMesage(hdr, ms, err)
+		} else if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
+			dumpAckMesage(hdr, ms)
 		}
 
 		if err := s.sendResponse(ctx, ms); err != nil {
diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go
index 1ee4296bc..029ba21b5 100644
--- a/pkg/sentry/socket/netlink/uevent/protocol.go
+++ b/pkg/sentry/socket/netlink/uevent/protocol.go
@@ -49,7 +49,7 @@ func (p *Protocol) CanSend() bool {
 }
 
 // ProcessMessage implements netlink.Protocol.ProcessMessage.
-func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error {
 	// Silently ignore all messages.
 	return nil
 }
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index 31ea66eca..0692482e9 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -20,6 +20,8 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
 	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/iptables"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -88,6 +90,59 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
 	return nicAddrs
 }
 
+// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr.
+func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error {
+	var (
+		protocol tcpip.NetworkProtocolNumber
+		address  tcpip.Address
+	)
+	switch addr.Family {
+	case linux.AF_INET:
+		if len(addr.Addr) < header.IPv4AddressSize {
+			return syserror.EINVAL
+		}
+		if addr.PrefixLen > header.IPv4AddressSize*8 {
+			return syserror.EINVAL
+		}
+		protocol = ipv4.ProtocolNumber
+		address = tcpip.Address(addr.Addr[:header.IPv4AddressSize])
+
+	case linux.AF_INET6:
+		if len(addr.Addr) < header.IPv6AddressSize {
+			return syserror.EINVAL
+		}
+		if addr.PrefixLen > header.IPv6AddressSize*8 {
+			return syserror.EINVAL
+		}
+		protocol = ipv6.ProtocolNumber
+		address = tcpip.Address(addr.Addr[:header.IPv6AddressSize])
+
+	default:
+		return syserror.ENOTSUP
+	}
+
+	protocolAddress := tcpip.ProtocolAddress{
+		Protocol: protocol,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   address,
+			PrefixLen: int(addr.PrefixLen),
+		},
+	}
+
+	// Attach address to interface.
+	if err := s.Stack.AddProtocolAddressWithOptions(tcpip.NICID(idx), protocolAddress, stack.CanBePrimaryEndpoint); err != nil {
+		return syserr.TranslateNetstackError(err).ToError()
+	}
+
+	// Add route for local network.
+	s.Stack.AddRoute(tcpip.Route{
+		Destination: protocolAddress.AddressWithPrefix.Subnet(),
+		Gateway:     "", // No gateway for local network.
+		NIC:         tcpip.NICID(idx),
+	})
+	return nil
+}
+
 // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize.
 func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
 	var rs tcp.ReceiveBufferSizeOption
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 7057b110e..b793f1d74 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -795,6 +795,8 @@ func (s *Stack) Forwarding() bool {
 
 // SetRouteTable assigns the route table to be used by this stack. It
 // specifies which NIC to use for given destination address ranges.
+//
+// This method takes ownership of the table.
 func (s *Stack) SetRouteTable(table []tcpip.Route) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -809,6 +811,13 @@ func (s *Stack) GetRouteTable() []tcpip.Route {
 	return append([]tcpip.Route(nil), s.routeTable...)
 }
 
+// AddRoute appends a route to the route table.
+func (s *Stack) AddRoute(route tcpip.Route) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.routeTable = append(s.routeTable, route)
+}
+
 // NewEndpoint creates a new transport layer endpoint of the given protocol.
 func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	t, ok := s.transportProtocols[transport]
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 273b014d6..f2e3c7072 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2769,9 +2769,11 @@ cc_binary(
     deps = [
         ":socket_netlink_util",
         ":socket_test_util",
+        "//test/util:capability_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:optional",
         gtest,
         "//test/util:test_main",
         "//test/util:test_util",
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index 1e28e658d..e5aed1eec 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -14,6 +14,7 @@
 
 #include <arpa/inet.h>
 #include <ifaddrs.h>
+#include <linux/if.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 #include <sys/socket.h>
@@ -25,8 +26,10 @@
 
 #include "gtest/gtest.h"
 #include "absl/strings/str_format.h"
+#include "absl/types/optional.h"
 #include "test/syscalls/linux/socket_netlink_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
 #include "test/util/cleanup.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/test_util.h"
@@ -38,6 +41,8 @@ namespace testing {
 
 namespace {
 
+constexpr uint32_t kSeq = 12345;
+
 using ::testing::AnyOf;
 using ::testing::Eq;
 
@@ -113,58 +118,224 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
   // TODO(mpratt): Check ifinfomsg contents and following attrs.
 }
 
+PosixError DumpLinks(
+    const FileDescriptor& fd, uint32_t seq,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_seq = seq;
+  req.ifm.ifi_family = AF_UNSPEC;
+
+  return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false);
+}
+
 TEST(NetlinkRouteTest, GetLinkDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
   uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
 
+  // Loopback is common among all tests, check that it's found.
+  bool loopbackFound = false;
+  ASSERT_NO_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    CheckGetLinkResponse(hdr, kSeq, port);
+    if (hdr->nlmsg_type != RTM_NEWLINK) {
+      return;
+    }
+    ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    std::cout << "Found interface idx=" << msg->ifi_index
+              << ", type=" << std::hex << msg->ifi_type;
+    if (msg->ifi_type == ARPHRD_LOOPBACK) {
+      loopbackFound = true;
+      EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
+    }
+  }));
+  EXPECT_TRUE(loopbackFound);
+}
+
+struct Link {
+  int index;
+  std::string name;
+};
+
+PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  absl::optional<Link> link;
+  RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    if (hdr->nlmsg_type != RTM_NEWLINK ||
+        hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
+      return;
+    }
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    if (msg->ifi_type == ARPHRD_LOOPBACK) {
+      const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+      if (rta == nullptr) {
+        // Ignore links that do not have a name.
+        return;
+      }
+
+      link = Link();
+      link->index = msg->ifi_index;
+      link->name = std::string(reinterpret_cast<const char*>(RTA_DATA(rta)));
+    }
+  }));
+  return link;
+}
+
+// CheckLinkMsg checks a netlink message against an expected link.
+void CheckLinkMsg(const struct nlmsghdr* hdr, const Link& link) {
+  ASSERT_THAT(hdr->nlmsg_type, Eq(RTM_NEWLINK));
+  ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
+  const struct ifinfomsg* msg =
+      reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+  EXPECT_EQ(msg->ifi_index, link.index);
+
+  const struct rtattr* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+  EXPECT_NE(nullptr, rta) << "IFLA_IFNAME not found in message.";
+  if (rta != nullptr) {
+    std::string name(reinterpret_cast<const char*>(RTA_DATA(rta)));
+    EXPECT_EQ(name, link.name);
+  }
+}
+
+TEST(NetlinkRouteTest, GetLinkByIndex) {
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
   struct request {
     struct nlmsghdr hdr;
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
-  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
+  req.ifm.ifi_index = loopback_link->index;
 
-  // Loopback is common among all tests, check that it's found.
-  bool loopbackFound = false;
+  bool found = false;
   ASSERT_NO_ERRNO(NetlinkRequestResponse(
       fd, &req, sizeof(req),
       [&](const struct nlmsghdr* hdr) {
-        CheckGetLinkResponse(hdr, kSeq, port);
-        if (hdr->nlmsg_type != RTM_NEWLINK) {
-          return;
-        }
-        ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
-        const struct ifinfomsg* msg =
-            reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
-        std::cout << "Found interface idx=" << msg->ifi_index
-                  << ", type=" << std::hex << msg->ifi_type;
-        if (msg->ifi_type == ARPHRD_LOOPBACK) {
-          loopbackFound = true;
-          EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
-        }
+        CheckLinkMsg(hdr, *loopback_link);
+        found = true;
       },
       false));
-  EXPECT_TRUE(loopbackFound);
+  EXPECT_TRUE(found) << "Netlink response does not contain any links.";
 }
 
-TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
+TEST(NetlinkRouteTest, GetLinkByName) {
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
 
   struct request {
     struct nlmsghdr hdr;
     struct ifinfomsg ifm;
+    struct rtattr rtattr;
+    char ifname[IFNAMSIZ];
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
   };
 
-  constexpr uint32_t kSeq = 12345;
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.rtattr.rta_type = IFLA_IFNAME;
+  req.rtattr.rta_len = RTA_LENGTH(loopback_link->name.size() + 1);
+  strncpy(req.ifname, loopback_link->name.c_str(), sizeof(req.ifname));
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  bool found = false;
+  ASSERT_NO_ERRNO(NetlinkRequestResponse(
+      fd, &req, sizeof(req),
+      [&](const struct nlmsghdr* hdr) {
+        CheckLinkMsg(hdr, *loopback_link);
+        found = true;
+      },
+      false));
+  EXPECT_TRUE(found) << "Netlink response does not contain any links.";
+}
+
+TEST(NetlinkRouteTest, GetLinkByIndexNotFound) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = sizeof(req);
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.ifm.ifi_index = 1234590;
+
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(ENODEV, ::testing::_));
+}
+
+TEST(NetlinkRouteTest, GetLinkByNameNotFound) {
+  const std::string name = "nodevice?!";
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+    struct rtattr rtattr;
+    char ifname[IFNAMSIZ];
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_GETLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifm.ifi_family = AF_UNSPEC;
+  req.rtattr.rta_type = IFLA_IFNAME;
+  req.rtattr.rta_len = RTA_LENGTH(name.size() + 1);
+  strncpy(req.ifname, name.c_str(), sizeof(req.ifname));
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(ENODEV, ::testing::_));
+}
+
+TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  };
 
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
@@ -175,18 +346,8 @@ TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
 
-  ASSERT_NO_ERRNO(NetlinkRequestResponse(
-      fd, &req, sizeof(req),
-      [&](const struct nlmsghdr* hdr) {
-        EXPECT_THAT(hdr->nlmsg_type, Eq(NLMSG_ERROR));
-        EXPECT_EQ(hdr->nlmsg_seq, kSeq);
-        EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct nlmsgerr));
-
-        const struct nlmsgerr* msg =
-            reinterpret_cast<const struct nlmsgerr*>(NLMSG_DATA(hdr));
-        EXPECT_EQ(msg->error, -EOPNOTSUPP);
-      },
-      true));
+  EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+              PosixErrorIs(EOPNOTSUPP, ::testing::_));
 }
 
 TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
@@ -198,8 +359,6 @@ TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
@@ -238,8 +397,6 @@ TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETLINK;
@@ -282,8 +439,6 @@ TEST(NetlinkRouteTest, ControlMessageIgnored) {
     struct ifinfomsg ifm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
 
   // This control message is ignored. We still receive a response for the
@@ -317,8 +472,6 @@ TEST(NetlinkRouteTest, GetAddrDump) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -367,6 +520,57 @@ TEST(NetlinkRouteTest, LookupAll) {
   ASSERT_GT(count, 0);
 }
 
+TEST(NetlinkRouteTest, AddAddr) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  absl::optional<Link> loopback_link =
+      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
+  ASSERT_TRUE(loopback_link.has_value());
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifaddrmsg ifa;
+    struct rtattr rtattr;
+    struct in_addr addr;
+    char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_type = RTM_NEWADDR;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifa.ifa_family = AF_INET;
+  req.ifa.ifa_prefixlen = 24;
+  req.ifa.ifa_flags = 0;
+  req.ifa.ifa_scope = 0;
+  req.ifa.ifa_index = loopback_link->index;
+  req.rtattr.rta_type = IFA_LOCAL;
+  req.rtattr.rta_len = RTA_LENGTH(sizeof(req.addr));
+  inet_pton(AF_INET, "10.0.0.1", &req.addr);
+  req.hdr.nlmsg_len =
+      NLMSG_LENGTH(sizeof(req.ifa)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+  // Create should succeed, as no such address exists in the kernel.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
+  EXPECT_NO_ERRNO(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+
+  // Replacing an existing address should succeed.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_REPLACE | NLM_F_ACK;
+  req.hdr.nlmsg_seq++;
+  EXPECT_NO_ERRNO(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+
+  // Create exclusive should fail, as we created the address above.
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
+  req.hdr.nlmsg_seq++;
+  EXPECT_THAT(
+      NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len),
+      PosixErrorIs(EEXIST, ::testing::_));
+}
+
 // GetRouteDump tests a RTM_GETROUTE + NLM_F_DUMP request.
 TEST(NetlinkRouteTest, GetRouteDump) {
   FileDescriptor fd =
@@ -378,8 +582,6 @@ TEST(NetlinkRouteTest, GetRouteDump) {
     struct rtmsg rtm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req = {};
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETROUTE;
@@ -538,8 +740,6 @@ TEST(NetlinkRouteTest, RecvmsgTrunc) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -615,8 +815,6 @@ TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -695,8 +893,6 @@ TEST(NetlinkRouteTest, NoPasscredNoCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
@@ -743,8 +939,6 @@ TEST(NetlinkRouteTest, PasscredCreds) {
     struct rtgenmsg rgm;
   };
 
-  constexpr uint32_t kSeq = 12345;
-
   struct request req;
   req.hdr.nlmsg_len = sizeof(req);
   req.hdr.nlmsg_type = RTM_GETADDR;
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
index cd2212a1a..952eecfe8 100644
--- a/test/syscalls/linux/socket_netlink_util.cc
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -16,6 +16,7 @@
 
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
+#include <linux/rtnetlink.h>
 #include <sys/socket.h>
 
 #include <vector>
@@ -71,9 +72,10 @@ PosixError NetlinkRequestResponse(
   iov.iov_base = buf.data();
   iov.iov_len = buf.size();
 
-  // Response is a series of NLM_F_MULTI messages, ending with a NLMSG_DONE
-  // message.
+  // If NLM_F_MULTI is set, the response is a series of messages that ends with
+  // an NLMSG_DONE message.
   int type = -1;
+  int flags = 0;
   do {
     int len;
     RETURN_ERROR_IF_SYSCALL_FAIL(len = RetryEINTR(recvmsg)(fd.get(), &msg, 0));
@@ -89,6 +91,7 @@ PosixError NetlinkRequestResponse(
     for (struct nlmsghdr* hdr = reinterpret_cast<struct nlmsghdr*>(buf.data());
          NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) {
       fn(hdr);
+      flags = hdr->nlmsg_flags;
       type = hdr->nlmsg_type;
       // Done should include an integer payload for dump_done_errno.
       // See net/netlink/af_netlink.c:netlink_dump
@@ -98,11 +101,11 @@ PosixError NetlinkRequestResponse(
         EXPECT_GE(hdr->nlmsg_len, NLMSG_LENGTH(sizeof(int)));
       }
     }
-  } while (type != NLMSG_DONE && type != NLMSG_ERROR);
+  } while ((flags & NLM_F_MULTI) && type != NLMSG_DONE && type != NLMSG_ERROR);
 
   if (expect_nlmsgerr) {
     EXPECT_EQ(type, NLMSG_ERROR);
-  } else {
+  } else if (flags & NLM_F_MULTI) {
     EXPECT_EQ(type, NLMSG_DONE);
   }
   return NoError();
@@ -146,5 +149,39 @@ PosixError NetlinkRequestResponseSingle(
   return NoError();
 }
 
+PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq,
+                                    void* request, size_t len) {
+  // Dummy negative number for no error message received.
+  // We won't get a negative error number so there will be no confusion.
+  int err = -42;
+  RETURN_IF_ERRNO(NetlinkRequestResponse(
+      fd, request, len,
+      [&](const struct nlmsghdr* hdr) {
+        EXPECT_EQ(NLMSG_ERROR, hdr->nlmsg_type);
+        EXPECT_EQ(hdr->nlmsg_seq, seq);
+        EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct nlmsgerr));
+
+        const struct nlmsgerr* msg =
+            reinterpret_cast<const struct nlmsgerr*>(NLMSG_DATA(hdr));
+        err = -msg->error;
+      },
+      true));
+  return PosixError(err);
+}
+
+const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr,
+                                const struct ifinfomsg* msg, int16_t attr) {
+  const int ifi_space = NLMSG_SPACE(sizeof(*msg));
+  int attrlen = hdr->nlmsg_len - ifi_space;
+  const struct rtattr* rta = reinterpret_cast<const struct rtattr*>(
+      reinterpret_cast<const uint8_t*>(hdr) + NLMSG_ALIGN(ifi_space));
+  for (; RTA_OK(rta, attrlen); rta = RTA_NEXT(rta, attrlen)) {
+    if (rta->rta_type == attr) {
+      return rta;
+    }
+  }
+  return nullptr;
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
index 3678c0599..e13ead406 100644
--- a/test/syscalls/linux/socket_netlink_util.h
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -19,6 +19,7 @@
 // socket.h has to be included before if_arp.h.
 #include <linux/if_arp.h>
 #include <linux/netlink.h>
+#include <linux/rtnetlink.h>
 
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
@@ -47,6 +48,14 @@ PosixError NetlinkRequestResponseSingle(
     const FileDescriptor& fd, void* request, size_t len,
     const std::function<void(const struct nlmsghdr* hdr)>& fn);
 
+// Send the passed request then expect and return an ack or error.
+PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq,
+                                    void* request, size_t len);
+
+// Find rtnetlink attribute in message.
+const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr,
+                                const struct ifinfomsg* msg, int16_t attr);
+
 }  // namespace testing
 }  // namespace gvisor
 
-- 
cgit v1.2.3


From eea0eeee933ba8406ae688fce4348271f9513514 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Wed, 5 Feb 2020 11:25:10 -0800
Subject: Disable get/set xattrs until list/remove exist too.

PiperOrigin-RevId: 293411655
---
 pkg/sentry/syscalls/linux/linux64_amd64.go |  27 ++++---
 pkg/sentry/syscalls/linux/linux64_arm64.go |  37 +++++----
 test/syscalls/linux/xattr.cc               | 124 +++++++++++++++++++++++++++++
 3 files changed, 159 insertions(+), 29 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 7435b50bf..588f8b087 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -228,18 +228,21 @@ var AMD64 = &kernel.SyscallTable{
 		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
-		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
-		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
-		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
-		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
-		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		// TODO(b/148303075): Enable set/getxattr (in their various
+		// forms) once we also have list and removexattr. The JVM
+		// assumes that if get/set exist, then list and remove do too.
+		188: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		189: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		190: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
 		200: syscalls.Supported("tkill", Tkill),
 		201: syscalls.Supported("time", Time),
 		202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 03a39fe65..06e5ee401 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -36,23 +36,26 @@ var ARM64 = &kernel.SyscallTable{
 	},
 	AuditNumber: linux.AUDIT_ARCH_AARCH64,
 	Table: map[uintptr]kernel.Syscall{
-		0:   syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		1:   syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
-		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
-		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
-		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
-		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		12:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		14:  syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		15:  syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		16:  syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		// TODO(b/148303075): Enable set/getxattr (in their various
+		// forms) once we also have list and removexattr. The JVM
+		// assumes that if get/set exist, then list and remove do too.
+		5:   syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		6:   syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		7:   syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		8:   syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		12:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		14:  syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		15:  syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		16:  syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
 		17:  syscalls.Supported("getcwd", Getcwd),
 		18:  syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
 		19:  syscalls.Supported("eventfd2", Eventfd2),
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index ab21d68c6..85eb31847 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -39,6 +39,10 @@ namespace {
 class XattrTest : public FileTest {};
 
 TEST_F(XattrTest, XattrNullName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
@@ -48,6 +52,10 @@ TEST_F(XattrTest, XattrNullName) {
 }
 
 TEST_F(XattrTest, XattrEmptyName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
@@ -56,6 +64,10 @@ TEST_F(XattrTest, XattrEmptyName) {
 }
 
 TEST_F(XattrTest, XattrLargeName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   std::string name = "user.";
   name += std::string(XATTR_NAME_MAX - name.length(), 'a');
@@ -77,6 +89,10 @@ TEST_F(XattrTest, XattrLargeName) {
 }
 
 TEST_F(XattrTest, XattrInvalidPrefix) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   std::string name(XATTR_NAME_MAX, 'a');
   EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
@@ -88,6 +104,10 @@ TEST_F(XattrTest, XattrInvalidPrefix) {
 // Do not allow save/restore cycles after making the test file read-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -113,6 +133,10 @@ TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
 // Do not allow save/restore cycles after making the test file write-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -143,6 +167,10 @@ TEST_F(XattrTest, XattrTrustedWithNonadmin) {
 }
 
 TEST_F(XattrTest, XattrOnDirectory) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
@@ -152,6 +180,10 @@ TEST_F(XattrTest, XattrOnDirectory) {
 }
 
 TEST_F(XattrTest, XattrOnSymlink) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
@@ -163,6 +195,10 @@ TEST_F(XattrTest, XattrOnSymlink) {
 }
 
 TEST_F(XattrTest, XattrOnInvalidFileTypes) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char name[] = "user.test";
 
   char char_device[] = "/dev/zero";
@@ -181,6 +217,10 @@ TEST_F(XattrTest, XattrOnInvalidFileTypes) {
 }
 
 TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -196,6 +236,10 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, SetxattrZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -208,6 +252,10 @@ TEST_F(XattrTest, SetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrSizeTooLarge) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
 
@@ -223,6 +271,10 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
@@ -232,6 +284,10 @@ TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -240,6 +296,10 @@ TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val(XATTR_SIZE_MAX + 1);
@@ -256,6 +316,10 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -271,6 +335,10 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithLarger) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -285,6 +353,10 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) {
 }
 
 TEST_F(XattrTest, SetxattrCreateFlag) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
@@ -296,6 +368,10 @@ TEST_F(XattrTest, SetxattrCreateFlag) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceFlag) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
@@ -308,6 +384,10 @@ TEST_F(XattrTest, SetxattrReplaceFlag) {
 }
 
 TEST_F(XattrTest, SetxattrInvalidFlags) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   int invalid_flags = 0xff;
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
@@ -315,6 +395,10 @@ TEST_F(XattrTest, SetxattrInvalidFlags) {
 }
 
 TEST_F(XattrTest, Getxattr) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -327,6 +411,10 @@ TEST_F(XattrTest, Getxattr) {
 }
 
 TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -339,6 +427,10 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -354,6 +446,10 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -367,6 +463,10 @@ TEST_F(XattrTest, GetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrSizeTooLarge) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -383,6 +483,10 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, GetxattrNullValue) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -394,6 +498,10 @@ TEST_F(XattrTest, GetxattrNullValue) {
 }
 
 TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -410,12 +518,20 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrNonexistentName) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, LGetSetxattrOnSymlink) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
@@ -427,6 +543,10 @@ TEST_F(XattrTest, LGetSetxattrOnSymlink) {
 }
 
 TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -441,6 +561,10 @@ TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
 }
 
 TEST_F(XattrTest, FGetSetxattr) {
+  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
+  // supported, and get/set have been added back to the syscall table.
+  SKIP_IF(IsRunningOnGvisor());
+
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0));
   const char name[] = "user.test";
-- 
cgit v1.2.3


From f3d95607036b8a502c65aa7b3e8145227274dbbc Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Wed, 5 Feb 2020 17:56:00 -0800
Subject: recv() on a closed TCP socket returns ENOTCONN

From RFC 793 s3.9 p58 Event Processing:

If RECEIVE Call arrives in CLOSED state and the user has access to such a
connection, the return should be "error: connection does not exist"

Fixes #1598

PiperOrigin-RevId: 293494287
---
 pkg/sentry/socket/netstack/netstack.go | 7 ++++++-
 pkg/tcpip/tcpip.go                     | 4 ++++
 pkg/tcpip/transport/tcp/endpoint.go    | 4 ++--
 pkg/tcpip/transport/tcp/tcp_test.go    | 9 ++++-----
 test/syscalls/linux/tcp_socket.cc      | 9 +++++++++
 5 files changed, 25 insertions(+), 8 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 049d04bf2..ed2fbcceb 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -2229,11 +2229,16 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq
 	var copied int
 
 	// Copy as many views as possible into the user-provided buffer.
-	for dst.NumBytes() != 0 {
+	for {
+		// Always do at least one fetchReadView, even if the number of bytes to
+		// read is 0.
 		err = s.fetchReadView()
 		if err != nil {
 			break
 		}
+		if dst.NumBytes() == 0 {
+			break
+		}
 
 		var n int
 		var e error
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 0fa141d58..d29d9a704 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -1124,6 +1124,10 @@ type ReadErrors struct {
 	// InvalidEndpointState is the number of times we found the endpoint state
 	// to be unexpected.
 	InvalidEndpointState StatCounter
+
+	// NotConnected is the number of times we tried to read but found that the
+	// endpoint was not connected.
+	NotConnected StatCounter
 }
 
 // WriteErrors collects packet write errors from an endpoint write call.
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index b5a8e15ee..e4a6b1b8b 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1003,8 +1003,8 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 		if s == StateError {
 			return buffer.View{}, tcpip.ControlMessages{}, he
 		}
-		e.stats.ReadErrors.InvalidEndpointState.Increment()
-		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
+		e.stats.ReadErrors.NotConnected.Increment()
+		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrNotConnected
 	}
 
 	v, err := e.readLocked()
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 2c1505067..cc118c993 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -5405,12 +5405,11 @@ func TestEndpointBindListenAcceptState(t *testing.T) {
 		t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
 	}
 
-	// Expect InvalidEndpointState errors on a read at this point.
-	if _, _, err := ep.Read(nil); err != tcpip.ErrInvalidEndpointState {
-		t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrInvalidEndpointState)
+	if _, _, err := ep.Read(nil); err != tcpip.ErrNotConnected {
+		t.Errorf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrNotConnected)
 	}
-	if got := ep.Stats().(*tcp.Stats).ReadErrors.InvalidEndpointState.Value(); got != 1 {
-		t.Fatalf("got EP stats Stats.ReadErrors.InvalidEndpointState got %v want %v", got, 1)
+	if got := ep.Stats().(*tcp.Stats).ReadErrors.NotConnected.Value(); got != 1 {
+		t.Errorf("got EP stats Stats.ReadErrors.NotConnected got %v want %v", got, 1)
 	}
 
 	if err := ep.Listen(10); err != nil {
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 525ccbd88..8a8b68e75 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1339,6 +1339,15 @@ TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptGreaterThanZero) {
   EXPECT_EQ(get, kTCPDeferAccept);
 }
 
+TEST_P(SimpleTcpSocketTest, RecvOnClosedSocket) {
+  auto s =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+  char buf[1];
+  EXPECT_THAT(recv(s.get(), buf, 0, 0), SyscallFailsWithErrno(ENOTCONN));
+  EXPECT_THAT(recv(s.get(), buf, sizeof(buf), 0),
+              SyscallFailsWithErrno(ENOTCONN));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From 1b6a12a768216a99a5e0428c42ea4faf79cf3b50 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 5 Feb 2020 22:45:44 -0800
Subject: Add notes to relevant tests.

These were out-of-band notes that can help provide additional context
and simplify automated imports.

PiperOrigin-RevId: 293525915
---
 pkg/metric/metric.go                          |  1 -
 pkg/sentry/arch/arch_x86.go                   |  4 ++
 pkg/sentry/arch/signal_amd64.go               |  2 +-
 pkg/sentry/fs/file_overlay_test.go            |  1 +
 pkg/sentry/fs/proc/README.md                  |  4 ++
 pkg/sentry/kernel/BUILD                       |  1 +
 pkg/sentry/kernel/kernel.go                   |  3 ++
 pkg/sentry/kernel/kernel_opts.go              | 20 +++++++
 pkg/sentry/socket/hostinet/BUILD              |  1 +
 pkg/sentry/socket/hostinet/socket.go          |  5 +-
 pkg/sentry/socket/hostinet/sockopt_impl.go    | 27 ++++++++++
 pkg/tcpip/transport/tcp/endpoint.go           |  3 ++
 runsc/boot/filter/BUILD                       |  1 +
 runsc/boot/filter/config.go                   | 13 -----
 runsc/boot/filter/config_profile.go           | 34 ++++++++++++
 runsc/container/console_test.go               |  5 +-
 runsc/dockerutil/dockerutil.go                | 11 ++--
 runsc/testutil/BUILD                          |  5 +-
 runsc/testutil/testutil.go                    | 54 -------------------
 runsc/testutil/testutil_runfiles.go           | 75 +++++++++++++++++++++++++++
 test/image/image_test.go                      |  8 +--
 test/syscalls/build_defs.bzl                  | 35 +++++++++++--
 test/syscalls/linux/chroot.cc                 |  2 +-
 test/syscalls/linux/concurrency.cc            |  3 +-
 test/syscalls/linux/exec_proc_exe_workload.cc |  6 +++
 test/syscalls/linux/fork.cc                   |  5 +-
 test/syscalls/linux/mmap.cc                   |  8 +--
 test/syscalls/linux/open_create.cc            |  1 +
 test/syscalls/linux/preadv.cc                 |  1 +
 test/syscalls/linux/proc.cc                   | 46 +++++++++++++---
 test/syscalls/linux/readv.cc                  |  4 +-
 test/syscalls/linux/rseq.cc                   |  2 +-
 test/syscalls/linux/select.cc                 |  2 +-
 test/syscalls/linux/shm.cc                    |  2 +-
 test/syscalls/linux/sigprocmask.cc            |  2 +-
 test/syscalls/linux/socket_unix_non_stream.cc |  4 +-
 test/syscalls/linux/symlink.cc                |  2 +-
 test/syscalls/linux/tcp_socket.cc             |  3 +-
 test/syscalls/linux/time.cc                   |  1 +
 test/syscalls/linux/tkill.cc                  |  2 +-
 test/util/temp_path.cc                        |  1 +
 tools/build/tags.bzl                          |  4 ++
 tools/defs.bzl                                | 17 +++++-
 43 files changed, 318 insertions(+), 113 deletions(-)
 create mode 100644 pkg/sentry/kernel/kernel_opts.go
 create mode 100644 pkg/sentry/socket/hostinet/sockopt_impl.go
 create mode 100644 runsc/boot/filter/config_profile.go
 create mode 100644 runsc/testutil/testutil_runfiles.go

(limited to 'test/syscalls/linux')

diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go
index 93d4f2b8c..006fcd9ab 100644
--- a/pkg/metric/metric.go
+++ b/pkg/metric/metric.go
@@ -46,7 +46,6 @@ var (
 //
 // TODO(b/67298402): Support non-cumulative metrics.
 // TODO(b/67298427): Support metric fields.
-//
 type Uint64Metric struct {
 	// value is the actual value of the metric. It must be accessed
 	// atomically.
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index a18093155..3db8bd34b 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -114,6 +114,10 @@ func newX86FPStateSlice() []byte {
 	size, align := cpuid.HostFeatureSet().ExtendedStateSize()
 	capacity := size
 	// Always use at least 4096 bytes.
+	//
+	// For the KVM platform, this state is a fixed 4096 bytes, so make sure
+	// that the underlying array is at _least_ that size otherwise we will
+	// corrupt random memory. This is not a pleasant thing to debug.
 	if capacity < 4096 {
 		capacity = 4096
 	}
diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go
index 81b92bb43..6fb756f0e 100644
--- a/pkg/sentry/arch/signal_amd64.go
+++ b/pkg/sentry/arch/signal_amd64.go
@@ -55,7 +55,7 @@ type SignalContext64 struct {
 	Trapno  uint64
 	Oldmask linux.SignalSet
 	Cr2     uint64
-	// Pointer to a struct _fpstate.
+	// Pointer to a struct _fpstate. See b/33003106#comment8.
 	Fpstate  uint64
 	Reserved [8]uint64
 }
diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go
index 02538bb4f..a76d87e3a 100644
--- a/pkg/sentry/fs/file_overlay_test.go
+++ b/pkg/sentry/fs/file_overlay_test.go
@@ -177,6 +177,7 @@ func TestReaddirRevalidation(t *testing.T) {
 
 // TestReaddirOverlayFrozen tests that calling Readdir on an overlay file with
 // a frozen dirent tree does not make Readdir calls to the underlying files.
+// This is a regression test for b/114808269.
 func TestReaddirOverlayFrozen(t *testing.T) {
 	ctx := contexttest.Context(t)
 
diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md
index 5d4ec6c7b..6667a0916 100644
--- a/pkg/sentry/fs/proc/README.md
+++ b/pkg/sentry/fs/proc/README.md
@@ -11,6 +11,8 @@ inconsistency, please file a bug.
 
 The following files are implemented:
 
+<!-- mdformat off(don't wrap the table) -->
+
 | File /proc/                 | Content                                               |
 | :------------------------   | :---------------------------------------------------- |
 | [cpuinfo](#cpuinfo)         | Info about the CPU                                    |
@@ -22,6 +24,8 @@ The following files are implemented:
 | [uptime](#uptime)           | Wall clock since boot, combined idle time of all cpus |
 | [version](#version)         | Kernel version                                        |
 
+<!-- mdformat on -->
+
 ### cpuinfo
 
 ```bash
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index a27628c0a..2231d6973 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -91,6 +91,7 @@ go_library(
         "fs_context.go",
         "ipc_namespace.go",
         "kernel.go",
+        "kernel_opts.go",
         "kernel_state.go",
         "pending_signals.go",
         "pending_signals_list.go",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index dcd6e91c4..3ee760ba2 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -235,6 +235,9 @@ type Kernel struct {
 	// events. This is initialized lazily on the first unimplemented
 	// syscall.
 	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`
+
+	// SpecialOpts contains special kernel options.
+	SpecialOpts
 }
 
 // InitKernelArgs holds arguments to Init.
diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go
new file mode 100644
index 000000000..2e66ec587
--- /dev/null
+++ b/pkg/sentry/kernel/kernel_opts.go
@@ -0,0 +1,20 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// SpecialOpts contains non-standard options for the kernel.
+//
+// +stateify savable
+type SpecialOpts struct{}
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index 5a07d5d0e..023bad156 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -10,6 +10,7 @@ go_library(
         "save_restore.go",
         "socket.go",
         "socket_unsafe.go",
+        "sockopt_impl.go",
         "stack.go",
     ],
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 34f63986f..de76388ac 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -285,7 +285,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 	}
 
 	// Whitelist options and constrain option length.
-	var optlen int
+	optlen := getSockOptLen(t, level, name)
 	switch level {
 	case linux.SOL_IP:
 		switch name {
@@ -330,7 +330,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 // SetSockOpt implements socket.Socket.SetSockOpt.
 func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
 	// Whitelist options and constrain option length.
-	var optlen int
+	optlen := setSockOptLen(t, level, name)
 	switch level {
 	case linux.SOL_IP:
 		switch name {
@@ -353,6 +353,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 			optlen = sizeofInt32
 		}
 	}
+
 	if optlen == 0 {
 		// Pretend to accept socket options we don't understand. This seems
 		// dangerous, but it's what netstack does...
diff --git a/pkg/sentry/socket/hostinet/sockopt_impl.go b/pkg/sentry/socket/hostinet/sockopt_impl.go
new file mode 100644
index 000000000..8a783712e
--- /dev/null
+++ b/pkg/sentry/socket/hostinet/sockopt_impl.go
@@ -0,0 +1,27 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostinet
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+func getSockOptLen(t *kernel.Task, level, name int) int {
+	return 0 // No custom options.
+}
+
+func setSockOptLen(t *kernel.Task, level, name int) int {
+	return 0 // No custom options.
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index e4a6b1b8b..f2be0e651 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2166,6 +2166,9 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	e.isRegistered = true
 	e.setEndpointState(StateListen)
 
+	// The channel may be non-nil when we're restoring the endpoint, and it
+	// may be pre-populated with some previously accepted (but not Accepted)
+	// endpoints.
 	if e.acceptedChan == nil {
 		e.acceptedChan = make(chan *endpoint, backlog)
 	}
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
index ce30f6c53..ed18f0047 100644
--- a/runsc/boot/filter/BUILD
+++ b/runsc/boot/filter/BUILD
@@ -8,6 +8,7 @@ go_library(
         "config.go",
         "config_amd64.go",
         "config_arm64.go",
+        "config_profile.go",
         "extra_filters.go",
         "extra_filters_msan.go",
         "extra_filters_race.go",
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index f8d351c7b..c69f4c602 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -536,16 +536,3 @@ func controlServerFilters(fd int) seccomp.SyscallRules {
 		},
 	}
 }
-
-// profileFilters returns extra syscalls made by runtime/pprof package.
-func profileFilters() seccomp.SyscallRules {
-	return seccomp.SyscallRules{
-		syscall.SYS_OPENAT: []seccomp.Rule{
-			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
-			},
-		},
-	}
-}
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
new file mode 100644
index 000000000..194952a7b
--- /dev/null
+++ b/runsc/boot/filter/config_profile.go
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// profileFilters returns extra syscalls made by runtime/pprof package.
+func profileFilters() seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_OPENAT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+			},
+		},
+	}
+}
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 060b63bf3..c2518d52b 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -196,7 +196,10 @@ func TestJobControlSignalExec(t *testing.T) {
 	defer ptyMaster.Close()
 	defer ptySlave.Close()
 
-	// Exec bash and attach a terminal.
+	// Exec bash and attach a terminal. Note that occasionally /bin/sh
+	// may be a different shell or have a different configuration (such
+	// as disabling interactive mode and job control). Since we want to
+	// explicitly test interactive mode, use /bin/bash. See b/116981926.
 	execArgs := &control.ExecArgs{
 		Filename: "/bin/bash",
 		// Don't let bash execute from profile or rc files, otherwise
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
index 9b6346ca2..1ff5e8cc3 100644
--- a/runsc/dockerutil/dockerutil.go
+++ b/runsc/dockerutil/dockerutil.go
@@ -143,8 +143,11 @@ func PrepareFiles(names ...string) (string, error) {
 		return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err)
 	}
 	for _, name := range names {
-		src := getLocalPath(name)
-		dst := path.Join(dir, name)
+		src, err := testutil.FindFile(name)
+		if err != nil {
+			return "", fmt.Errorf("testutil.Preparefiles(%q) failed: %v", name, err)
+		}
+		dst := path.Join(dir, path.Base(name))
 		if err := testutil.Copy(src, dst); err != nil {
 			return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err)
 		}
@@ -152,10 +155,6 @@ func PrepareFiles(names ...string) (string, error) {
 	return dir, nil
 }
 
-func getLocalPath(file string) string {
-	return path.Join(".", file)
-}
-
 // do executes docker command.
 func do(args ...string) (string, error) {
 	log.Printf("Running: docker %s\n", args)
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
index f845120b0..945405303 100644
--- a/runsc/testutil/BUILD
+++ b/runsc/testutil/BUILD
@@ -5,7 +5,10 @@ package(licenses = ["notice"])
 go_library(
     name = "testutil",
     testonly = 1,
-    srcs = ["testutil.go"],
+    srcs = [
+        "testutil.go",
+        "testutil_runfiles.go",
+    ],
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/log",
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index edf2e809a..80c2c9680 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -79,60 +79,6 @@ func ConfigureExePath() error {
 	return nil
 }
 
-// FindFile searchs for a file inside the test run environment. It returns the
-// full path to the file. It fails if none or more than one file is found.
-func FindFile(path string) (string, error) {
-	wd, err := os.Getwd()
-	if err != nil {
-		return "", err
-	}
-
-	// The test root is demarcated by a path element called "__main__". Search for
-	// it backwards from the working directory.
-	root := wd
-	for {
-		dir, name := filepath.Split(root)
-		if name == "__main__" {
-			break
-		}
-		if len(dir) == 0 {
-			return "", fmt.Errorf("directory __main__ not found in %q", wd)
-		}
-		// Remove ending slash to loop around.
-		root = dir[:len(dir)-1]
-	}
-
-	// Annoyingly, bazel adds the build type to the directory path for go
-	// binaries, but not for c++ binaries. We use two different patterns to
-	// to find our file.
-	patterns := []string{
-		// Try the obvious path first.
-		filepath.Join(root, path),
-		// If it was a go binary, use a wildcard to match the build
-		// type. The pattern is: /test-path/__main__/directories/*/file.
-		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
-	}
-
-	for _, p := range patterns {
-		matches, err := filepath.Glob(p)
-		if err != nil {
-			// "The only possible returned error is ErrBadPattern,
-			// when pattern is malformed." -godoc
-			return "", fmt.Errorf("error globbing %q: %v", p, err)
-		}
-		switch len(matches) {
-		case 0:
-			// Try the next pattern.
-		case 1:
-			// We found it.
-			return matches[0], nil
-		default:
-			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
-		}
-	}
-	return "", fmt.Errorf("file %q not found", path)
-}
-
 // TestConfig returns the default configuration to use in tests. Note that
 // 'RootDir' must be set by caller if required.
 func TestConfig() *boot.Config {
diff --git a/runsc/testutil/testutil_runfiles.go b/runsc/testutil/testutil_runfiles.go
new file mode 100644
index 000000000..ece9ea9a1
--- /dev/null
+++ b/runsc/testutil/testutil_runfiles.go
@@ -0,0 +1,75 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testutil
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// FindFile searches for a file inside the test run environment. It returns the
+// full path to the file. It fails if none or more than one file is found.
+func FindFile(path string) (string, error) {
+	wd, err := os.Getwd()
+	if err != nil {
+		return "", err
+	}
+
+	// The test root is demarcated by a path element called "__main__". Search for
+	// it backwards from the working directory.
+	root := wd
+	for {
+		dir, name := filepath.Split(root)
+		if name == "__main__" {
+			break
+		}
+		if len(dir) == 0 {
+			return "", fmt.Errorf("directory __main__ not found in %q", wd)
+		}
+		// Remove ending slash to loop around.
+		root = dir[:len(dir)-1]
+	}
+
+	// Annoyingly, bazel adds the build type to the directory path for go
+	// binaries, but not for c++ binaries. We use two different patterns to
+	// find our file.
+	patterns := []string{
+		// Try the obvious path first.
+		filepath.Join(root, path),
+		// If it was a go binary, use a wildcard to match the build
+		// type. The pattern is: /test-path/__main__/directories/*/file.
+		filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)),
+	}
+
+	for _, p := range patterns {
+		matches, err := filepath.Glob(p)
+		if err != nil {
+			// "The only possible returned error is ErrBadPattern,
+			// when pattern is malformed." -godoc
+			return "", fmt.Errorf("error globbing %q: %v", p, err)
+		}
+		switch len(matches) {
+		case 0:
+			// Try the next pattern.
+		case 1:
+			// We found it.
+			return matches[0], nil
+		default:
+			return "", fmt.Errorf("more than one match found for %q: %s", path, matches)
+		}
+	}
+	return "", fmt.Errorf("file %q not found", path)
+}
diff --git a/test/image/image_test.go b/test/image/image_test.go
index d0dcb1861..0a1e19d6f 100644
--- a/test/image/image_test.go
+++ b/test/image/image_test.go
@@ -107,7 +107,7 @@ func TestHttpd(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("http-test")
 
-	dir, err := dockerutil.PrepareFiles("latin10k.txt")
+	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -139,7 +139,7 @@ func TestNginx(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("net-test")
 
-	dir, err := dockerutil.PrepareFiles("latin10k.txt")
+	dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -183,7 +183,7 @@ func TestMysql(t *testing.T) {
 	}
 
 	client := dockerutil.MakeDocker("mysql-client-test")
-	dir, err := dockerutil.PrepareFiles("mysql.sql")
+	dir, err := dockerutil.PrepareFiles("test/image/mysql.sql")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
@@ -283,7 +283,7 @@ func TestRuby(t *testing.T) {
 	}
 	d := dockerutil.MakeDocker("ruby-test")
 
-	dir, err := dockerutil.PrepareFiles("ruby.rb", "ruby.sh")
+	dir, err := dockerutil.PrepareFiles("test/image/ruby.rb", "test/image/ruby.sh")
 	if err != nil {
 		t.Fatalf("PrepareFiles() failed: %v", err)
 	}
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index 1df761dd0..cbab85ef7 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -2,8 +2,6 @@
 
 load("//tools:defs.bzl", "loopback")
 
-# syscall_test is a macro that will create targets to run the given test target
-# on the host (native) and runsc.
 def syscall_test(
         test,
         shard_count = 5,
@@ -13,6 +11,19 @@ def syscall_test(
         add_uds_tree = False,
         add_hostinet = False,
         tags = None):
+    """syscall_test is a macro that will create targets for all platforms.
+
+    Args:
+      test: the test target.
+      shard_count: shards for defined tests.
+      size: the defined test size.
+      use_tmpfs: use tmpfs in the defined tests.
+      add_overlay: add an overlay test.
+      add_uds_tree: add a UDS test.
+      add_hostinet: add a hostinet test.
+      tags: starting test tags.
+    """
+
     _syscall_test(
         test = test,
         shard_count = shard_count,
@@ -111,6 +122,19 @@ def _syscall_test(
     # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared.
     tags += [full_platform, "file_" + file_access]
 
+    # Hash this target into one of 15 buckets. This can be used to
+    # randomly split targets between different workflows.
+    hash15 = hash(native.package_name() + name) % 15
+    tags.append("hash15:" + str(hash15))
+
+    # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until
+    # we figure out how to request ipv4 sockets on Guitar machines.
+    if network == "host":
+        tags.append("noguitar")
+
+    # Disable off-host networking.
+    tags.append("requires-net:loopback")
+
     # Add tag to prevent the tests from running in a Bazel sandbox.
     # TODO(b/120560048): Make the tests run without this tag.
     tags.append("no-sandbox")
@@ -118,8 +142,11 @@ def _syscall_test(
     # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is
     # more stable.
     if platform == "kvm":
-        tags += ["manual"]
-        tags += ["requires-kvm"]
+        tags.append("manual")
+        tags.append("requires-kvm")
+
+        # TODO(b/112165693): Remove when tests pass reliably.
+        tags.append("notap")
 
     args = [
         # Arguments are passed directly to syscall_test_runner binary.
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
index 0a2d44a2c..85ec013d5 100644
--- a/test/syscalls/linux/chroot.cc
+++ b/test/syscalls/linux/chroot.cc
@@ -167,7 +167,7 @@ TEST(ChrootTest, DotDotFromOpenFD) {
 }
 
 // Test that link resolution in a chroot can escape the root by following an
-// open proc fd.
+// open proc fd. Regression test for b/32316719.
 TEST(ChrootTest, ProcFdLinkResolutionInChroot) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
 
diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc
index f41f99900..7cd6a75bd 100644
--- a/test/syscalls/linux/concurrency.cc
+++ b/test/syscalls/linux/concurrency.cc
@@ -46,7 +46,8 @@ TEST(ConcurrencyTest, SingleProcessMultithreaded) {
 }
 
 // Test that multiple threads in this process continue to execute in parallel,
-// even if an unrelated second process is spawned.
+// even if an unrelated second process is spawned. Regression test for
+// b/32119508.
 TEST(ConcurrencyTest, MultiProcessMultithreaded) {
   // In PID 1, start TIDs 1 and 2, and put both to sleep.
   //
diff --git a/test/syscalls/linux/exec_proc_exe_workload.cc b/test/syscalls/linux/exec_proc_exe_workload.cc
index b790fe5be..2989379b7 100644
--- a/test/syscalls/linux/exec_proc_exe_workload.cc
+++ b/test/syscalls/linux/exec_proc_exe_workload.cc
@@ -21,6 +21,12 @@
 #include "test/util/posix_error.h"
 
 int main(int argc, char** argv, char** envp) {
+  // This is annoying. Because remote build systems may put these binaries
+  // in a content-addressable-store, you may wind up with /proc/self/exe
+  // pointing to some random path (but with a sensible argv[0]).
+  //
+  // Therefore, this test simply checks that the /proc/self/exe
+  // is absolute and *doesn't* match argv[1].
   std::string exe =
       gvisor::testing::ProcessExePath(getpid()).ValueOrDie();
   if (exe[0] != '/') {
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index 906f3358d..ff8bdfeb0 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -271,7 +271,7 @@ TEST_F(ForkTest, Alarm) {
   EXPECT_EQ(0, alarmed);
 }
 
-// Child cannot affect parent private memory.
+// Child cannot affect parent private memory. Regression test for b/24137240.
 TEST_F(ForkTest, PrivateMemory) {
   std::atomic<uint32_t> local(0);
 
@@ -298,6 +298,9 @@ TEST_F(ForkTest, PrivateMemory) {
 }
 
 // Kernel-accessed buffers should remain coherent across COW.
+//
+// The buffer must be >= usermem.ZeroCopyMinBytes, as UnsafeAccess operates
+// differently. Regression test for b/33811887.
 TEST_F(ForkTest, COWSegment) {
   constexpr int kBufSize = 1024;
   char* read_buf = private_;
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index 1c4d9f1c7..11fb1b457 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -1418,7 +1418,7 @@ TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) {
 //
 // On most platforms this is trivial, but when the file is mapped via the sentry
 // page cache (which does not yet support writing to shared mappings), a bug
-// caused reads to fail unnecessarily on such mappings.
+// caused reads to fail unnecessarily on such mappings. See b/28913513.
 TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) {
   uintptr_t addr;
   size_t len = strlen(kFileContents);
@@ -1435,7 +1435,7 @@ TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) {
 
 // Tests that EFAULT is returned when invoking a syscall that requires the OS to
 // read past end of file (resulting in a fault in sentry context in the gVisor
-// case).
+// case). See b/28913513.
 TEST_F(MMapFileTest, InternalSigBus) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
@@ -1578,7 +1578,7 @@ TEST_F(MMapFileTest, Bug38498194) {
 }
 
 // Tests that reading from a file to a memory mapping of the same file does not
-// deadlock.
+// deadlock. See b/34813270.
 TEST_F(MMapFileTest, SelfRead) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
@@ -1590,7 +1590,7 @@ TEST_F(MMapFileTest, SelfRead) {
 }
 
 // Tests that writing to a file from a memory mapping of the same file does not
-// deadlock.
+// deadlock. Regression test for b/34813270.
 TEST_F(MMapFileTest, SelfWrite) {
   uintptr_t addr;
   ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc
index 431733dbe..902d0a0dc 100644
--- a/test/syscalls/linux/open_create.cc
+++ b/test/syscalls/linux/open_create.cc
@@ -132,6 +132,7 @@ TEST(CreateTest, CreateFailsOnDirWithoutWritePerms) {
 }
 
 // A file originally created RW, but opened RO can later be opened RW.
+// Regression test for b/65385065.
 TEST(CreateTest, OpenCreateROThenRW) {
   TempPath file(NewTempAbsPath());
 
diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc
index f7ea44054..5b0743fe9 100644
--- a/test/syscalls/linux/preadv.cc
+++ b/test/syscalls/linux/preadv.cc
@@ -37,6 +37,7 @@ namespace testing {
 
 namespace {
 
+// Stress copy-on-write. Attempts to reproduce b/38430174.
 TEST(PreadvTest, MMConcurrencyStress) {
   // Fill a one-page file with zeroes (the contents don't really matter).
   const auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 169b723eb..a23fdb58d 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1352,13 +1352,19 @@ TEST(ProcPidSymlink, SubprocessZombied) {
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
-  // 4.17 & gVisor: Syscall succeeds and returns 1
+  //
+  // ~4.3: Syscall fails with EACCES.
+  // 4.17 & gVisor: Syscall succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(EACCES));
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
-  // 4.17 &  gVisor: Syscall succeeds and returns 1.
+  //
+  // ~4.3: Syscall fails with EACCES.
+  // 4.17 & gVisor: Syscall succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(EACCES));
 }
@@ -1431,8 +1437,12 @@ TEST(ProcPidFile, SubprocessRunning) {
 TEST(ProcPidFile, SubprocessZombie) {
   char buf[1];
 
-  // 4.17: Succeeds and returns 1
-  // gVisor: Succeeds and returns 0
+  // FIXME(gvisor.dev/issue/164): Loosen requirement due to inconsistent
+  // behavior on different kernels.
+  //
+  // ~4.3: Succeeds and returns 0.
+  // 4.17: Succeeds and returns 1.
+  // gVisor: Succeeds and returns 0.
   EXPECT_THAT(ReadWhileZombied("auxv", buf, sizeof(buf)), SyscallSucceeds());
 
   EXPECT_THAT(ReadWhileZombied("cmdline", buf, sizeof(buf)),
@@ -1458,7 +1468,10 @@ TEST(ProcPidFile, SubprocessZombie) {
 
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
+  //
+  // ~4.3: Fails and returns EACCES.
   // gVisor & 4.17: Succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadWhileZombied("io", buf, sizeof(buf)),
   //          SyscallFailsWithErrno(EACCES));
 }
@@ -1467,9 +1480,12 @@ TEST(ProcPidFile, SubprocessZombie) {
 TEST(ProcPidFile, SubprocessExited) {
   char buf[1];
 
-  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels
+  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels.
+  //
+  // ~4.3: Fails and returns ESRCH.
   // gVisor: Fails with ESRCH.
   // 4.17: Succeeds and returns 1.
+  //
   // EXPECT_THAT(ReadWhileExited("auxv", buf, sizeof(buf)),
   //            SyscallFailsWithErrno(ESRCH));
 
@@ -1641,7 +1657,7 @@ TEST(ProcTask, KilledThreadsDisappear) {
   EXPECT_NO_ERRNO(DirContainsExactly("/proc/self/task",
                                      TaskFiles(initial, {child1.Tid()})));
 
-  // Stat child1's task file.
+  // Stat child1's task file. Regression test for b/32097707.
   struct stat statbuf;
   const std::string child1_task_file =
       absl::StrCat("/proc/self/task/", child1.Tid());
@@ -1669,7 +1685,7 @@ TEST(ProcTask, KilledThreadsDisappear) {
   EXPECT_NO_ERRNO(EventuallyDirContainsExactly(
       "/proc/self/task", TaskFiles(initial, {child3.Tid(), child5.Tid()})));
 
-  // Stat child1's task file again.  This time it should fail.
+  // Stat child1's task file again.  This time it should fail. See b/32097707.
   EXPECT_THAT(stat(child1_task_file.c_str(), &statbuf),
               SyscallFailsWithErrno(ENOENT));
 
@@ -1824,7 +1840,7 @@ TEST(ProcSysVmOvercommitMemory, HasNumericValue) {
 }
 
 // Check that link for proc fd entries point the target node, not the
-// symlink itself.
+// symlink itself. Regression test for b/31155070.
 TEST(ProcTaskFd, FstatatFollowsSymlink) {
   const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
   const FileDescriptor fd =
@@ -1883,6 +1899,20 @@ TEST(ProcMounts, IsSymlink) {
   EXPECT_EQ(link, "self/mounts");
 }
 
+TEST(ProcSelfMountinfo, RequiredFieldsArePresent) {
+  auto mountinfo =
+      ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mountinfo"));
+  EXPECT_THAT(
+      mountinfo,
+      AllOf(
+          // Root mount.
+          ContainsRegex(
+              R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / / (rw|ro).*- \S+ \S+ (rw|ro)\S*)"),
+          // Proc mount - always rw.
+          ContainsRegex(
+              R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / /proc rw.*- \S+ \S+ rw\S*)")));
+}
+
 // Check that /proc/self/mounts looks something like a real mounts file.
 TEST(ProcSelfMounts, RequiredFieldsArePresent) {
   auto mounts = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mounts"));
diff --git a/test/syscalls/linux/readv.cc b/test/syscalls/linux/readv.cc
index 4069cbc7e..baaf9f757 100644
--- a/test/syscalls/linux/readv.cc
+++ b/test/syscalls/linux/readv.cc
@@ -254,7 +254,9 @@ TEST_F(ReadvTest, IovecOutsideTaskAddressRangeInNonemptyArray) {
 // This test depends on the maximum extent of a single readv() syscall, so
 // we can't tolerate interruption from saving.
 TEST(ReadvTestNoFixture, TruncatedAtMax_NoRandomSave) {
-  // Ensure that we won't be interrupted by ITIMER_PROF.
+  // Ensure that we won't be interrupted by ITIMER_PROF. This is particularly
+  // important in environments where automated profiling tools may start
+  // ITIMER_PROF automatically.
   struct itimerval itv = {};
   auto const cleanup_itimer =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_PROF, itv));
diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc
index 106c045e3..4bfb1ff56 100644
--- a/test/syscalls/linux/rseq.cc
+++ b/test/syscalls/linux/rseq.cc
@@ -36,7 +36,7 @@ namespace {
 // We must be very careful about how these tests are written. Each thread may
 // only have one struct rseq registration, which may be done automatically at
 // thread start (as of 2019-11-13, glibc does *not* support rseq and thus does
-// not do so).
+// not do so, but other libraries do).
 //
 // Testing of rseq is thus done primarily in a child process with no
 // registration. This means exec'ing a nostdlib binary, as rseq registration can
diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc
index 424e2a67f..be2364fb8 100644
--- a/test/syscalls/linux/select.cc
+++ b/test/syscalls/linux/select.cc
@@ -146,7 +146,7 @@ TEST_F(SelectTest, IgnoreBitsAboveNfds) {
 
 // This test illustrates Linux's behavior of 'select' calls passing after
 // setrlimit RLIMIT_NOFILE is called. In particular, versions of sshd rely on
-// this behavior.
+// this behavior. See b/122318458.
 TEST_F(SelectTest, SetrlimitCallNOFILE) {
   fd_set read_set;
   FD_ZERO(&read_set);
diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc
index 7ba752599..c7fdbb924 100644
--- a/test/syscalls/linux/shm.cc
+++ b/test/syscalls/linux/shm.cc
@@ -473,7 +473,7 @@ TEST(ShmTest, PartialUnmap) {
 }
 
 // Check that sentry does not panic when asked for a zero-length private shm
-// segment.
+// segment. Regression test for b/110694797.
 TEST(ShmTest, GracefullyFailOnZeroLenSegmentCreation) {
   EXPECT_THAT(Shmget(IPC_PRIVATE, 0, 0), PosixErrorIs(EINVAL, _));
 }
diff --git a/test/syscalls/linux/sigprocmask.cc b/test/syscalls/linux/sigprocmask.cc
index 654c6a47f..a603fc1d1 100644
--- a/test/syscalls/linux/sigprocmask.cc
+++ b/test/syscalls/linux/sigprocmask.cc
@@ -237,7 +237,7 @@ TEST_F(SigProcMaskTest, SignalHandler) {
 }
 
 // Check that sigprocmask correctly handles aliasing of the set and oldset
-// pointers.
+// pointers. Regression test for b/30502311.
 TEST_F(SigProcMaskTest, AliasedSets) {
   sigset_t mask;
 
diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc
index 276a94eb8..884319e1d 100644
--- a/test/syscalls/linux/socket_unix_non_stream.cc
+++ b/test/syscalls/linux/socket_unix_non_stream.cc
@@ -109,7 +109,7 @@ PosixErrorOr<std::vector<Mapping>> CreateFragmentedRegion(const int size,
 }
 
 // A contiguous iov that is heavily fragmented in FileMem can still be sent
-// successfully.
+// successfully. See b/115833655.
 TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -165,7 +165,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) {
 }
 
 // A contiguous iov that is heavily fragmented in FileMem can still be received
-// into successfully.
+// into successfully. Regression test for b/115833655.
 TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc
index b249ff91f..03ee1250d 100644
--- a/test/syscalls/linux/symlink.cc
+++ b/test/syscalls/linux/symlink.cc
@@ -38,7 +38,7 @@ mode_t FilePermission(const std::string& path) {
 }
 
 // Test that name collisions are checked on the new link path, not the source
-// path.
+// path. Regression test for b/31782115.
 TEST(SymlinkTest, CanCreateSymlinkWithCachedSourceDirent) {
   const std::string srcname = NewTempAbsPath();
   const std::string newname = NewTempAbsPath();
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 8a8b68e75..c4591a3b9 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -244,7 +244,8 @@ TEST_P(TcpSocketTest, ZeroWriteAllowed) {
 }
 
 // Test that a non-blocking write with a buffer that is larger than the send
-// buffer size will not actually write the whole thing at once.
+// buffer size will not actually write the whole thing at once. Regression test
+// for b/64438887.
 TEST_P(TcpSocketTest, NonblockingLargeWrite) {
   // Set the FD to O_NONBLOCK.
   int opts;
diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc
index c7eead17e..1ccb95733 100644
--- a/test/syscalls/linux/time.cc
+++ b/test/syscalls/linux/time.cc
@@ -62,6 +62,7 @@ TEST(TimeTest, VsyscallTime_InvalidAddressSIGSEGV) {
               ::testing::KilledBySignal(SIGSEGV), "");
 }
 
+// Mimics the gettimeofday(2) wrapper from the Go runtime <= 1.2.
 int vsyscall_gettimeofday(struct timeval* tv, struct timezone* tz) {
   constexpr uint64_t kVsyscallGettimeofdayEntry = 0xffffffffff600000;
   return reinterpret_cast<int (*)(struct timeval*, struct timezone*)>(
diff --git a/test/syscalls/linux/tkill.cc b/test/syscalls/linux/tkill.cc
index bae377c69..8d8ebbb24 100644
--- a/test/syscalls/linux/tkill.cc
+++ b/test/syscalls/linux/tkill.cc
@@ -54,7 +54,7 @@ void SigHandler(int sig, siginfo_t* info, void* context) {
   TEST_CHECK(info->si_code == SI_TKILL);
 }
 
-// Test with a real signal.
+// Test with a real signal. Regression test for b/24790092.
 TEST(TkillTest, ValidTIDAndRealSignal) {
   struct sigaction sa;
   sa.sa_sigaction = SigHandler;
diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc
index 35aacb172..9c10b6674 100644
--- a/test/util/temp_path.cc
+++ b/test/util/temp_path.cc
@@ -77,6 +77,7 @@ std::string NewTempAbsPath() {
 std::string NewTempRelPath() { return NextTempBasename(); }
 
 std::string GetAbsoluteTestTmpdir() {
+  // Note that TEST_TMPDIR is guaranteed to be set.
   char* env_tmpdir = getenv("TEST_TMPDIR");
   std::string tmp_dir =
       env_tmpdir != nullptr ? std::string(env_tmpdir) : "/tmp";
diff --git a/tools/build/tags.bzl b/tools/build/tags.bzl
index e99c87f81..a6db44e47 100644
--- a/tools/build/tags.bzl
+++ b/tools/build/tags.bzl
@@ -33,4 +33,8 @@ go_suffixes = [
     "_wasm_unsafe",
     "_linux",
     "_linux_unsafe",
+    "_opts",
+    "_opts_unsafe",
+    "_impl",
+    "_impl_unsafe",
 ]
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 5d5fa134a..c03b557ae 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -73,6 +73,16 @@ def calculate_sets(srcs):
             result[target].append(file)
     return result
 
+def go_imports(name, src, out):
+    """Simplify a single Go source file by eliminating unused imports."""
+    native.genrule(
+        name = name,
+        srcs = [src],
+        outs = [out],
+        tools = ["@org_golang_x_tools//cmd/goimports:goimports"],
+        cmd = ("$(location @org_golang_x_tools//cmd/goimports:goimports) $(SRCS) > $@"),
+    )
+
 def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs):
     """Wraps the standard go_library and does stateification and marshalling.
 
@@ -107,10 +117,15 @@ def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = F
         state_sets = calculate_sets(srcs)
         for (suffix, srcs) in state_sets.items():
             go_stateify(
-                name = name + suffix + "_state_autogen",
+                name = name + suffix + "_state_autogen_with_imports",
                 srcs = srcs,
                 imports = imports,
                 package = name,
+                out = name + suffix + "_state_autogen_with_imports.go",
+            )
+            go_imports(
+                name = name + suffix + "_state_autogen",
+                src = name + suffix + "_state_autogen_with_imports.go",
                 out = name + suffix + "_state_autogen.go",
             )
         all_srcs = all_srcs + [
-- 
cgit v1.2.3


From 0e96fcafd4404e1418c84b7830b9455867e174bb Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 6 Feb 2020 10:11:15 -0800
Subject: Fix test case on AMD.

When ignored, the trap should be executed which generates
a SIGSEGV as in the above case.

PiperOrigin-RevId: 293618489
---
 test/syscalls/linux/32bit.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index 9883aef61..c47a05181 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -155,7 +155,7 @@ TEST(Syscall32Bit, Syscall) {
     case PlatformSupport::Ignored:
       // See above.
       EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
-                  ::testing::KilledBySignal(SIGILL), "");
+                  ::testing::KilledBySignal(SIGSEGV), "");
       break;
 
     case PlatformSupport::Allowed:
-- 
cgit v1.2.3


From 6de49546cb32806896cec27d3ab76e96323ecac1 Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Fri, 7 Feb 2020 13:18:19 -0800
Subject: Refactor syscall tests

- Move shared helpers V4Multicast and V4Broadcast to socket_test_util
- Add unnamed namespace so socket_ipv4_tcp_unbound_external_networking_test.cc
  and socket_ipv4_udp_unbound_external_networking_test.cc can be compiled
  together
- Add test files to "exports_files" so they can be included by Fuchsia's syscall
  test setup

PiperOrigin-RevId: 293880429
---
 test/syscalls/linux/BUILD                           |  3 +++
 ...ket_ipv4_tcp_unbound_external_networking_test.cc |  3 +++
 test/syscalls/linux/socket_ipv4_udp_unbound.cc      | 21 ---------------------
 .../socket_ipv4_udp_unbound_external_networking.cc  | 20 --------------------
 ...ket_ipv4_udp_unbound_external_networking_test.cc |  3 +++
 test/syscalls/linux/socket_test_util.cc             | 18 ++++++++++++++++++
 test/syscalls/linux/socket_test_util.h              |  5 +++++
 7 files changed, 32 insertions(+), 41 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index f2e3c7072..12d389c3e 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -12,6 +12,9 @@ exports_files(
         "socket_ip_loopback_blocking.cc",
         "socket_ip_tcp_loopback.cc",
         "socket_ip_udp_loopback.cc",
+        "socket_ip_unbound.cc",
+        "socket_ipv4_tcp_unbound_external_networking_test.cc",
+        "socket_ipv4_udp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
         "tcp_socket.cc",
         "udp_socket.cc",
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
index 3ac790873..797c4174e 100644
--- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
@@ -22,6 +22,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketKind> GetSockets() {
   return ApplyVec<SocketKind>(
@@ -32,5 +33,7 @@ std::vector<SocketKind> GetSockets() {
 INSTANTIATE_TEST_SUITE_P(IPv4TCPUnboundSockets,
                          IPv4TCPUnboundExternalNetworkingSocketTest,
                          ::testing::ValuesIn(GetSockets()));
+
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index aa6fb4e3f..990ccf23c 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -30,27 +30,6 @@
 namespace gvisor {
 namespace testing {
 
-constexpr char kMulticastAddress[] = "224.0.2.1";
-constexpr char kBroadcastAddress[] = "255.255.255.255";
-
-TestAddress V4Multicast() {
-  TestAddress t("V4Multicast");
-  t.addr.ss_family = AF_INET;
-  t.addr_len = sizeof(sockaddr_in);
-  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
-      inet_addr(kMulticastAddress);
-  return t;
-}
-
-TestAddress V4Broadcast() {
-  TestAddress t("V4Broadcast");
-  t.addr.ss_family = AF_INET;
-  t.addr_len = sizeof(sockaddr_in);
-  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
-      inet_addr(kBroadcastAddress);
-  return t;
-}
-
 // Check that packets are not received without a group membership. Default send
 // interface configured by bind.
 TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNoGroup) {
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
index 98ae414f3..40e673625 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
@@ -41,26 +41,6 @@ TestAddress V4EmptyAddress() {
   return t;
 }
 
-constexpr char kMulticastAddress[] = "224.0.2.1";
-
-TestAddress V4Multicast() {
-  TestAddress t("V4Multicast");
-  t.addr.ss_family = AF_INET;
-  t.addr_len = sizeof(sockaddr_in);
-  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
-      inet_addr(kMulticastAddress);
-  return t;
-}
-
-TestAddress V4Broadcast() {
-  TestAddress t("V4Broadcast");
-  t.addr.ss_family = AF_INET;
-  t.addr_len = sizeof(sockaddr_in);
-  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
-      htonl(INADDR_BROADCAST);
-  return t;
-}
-
 void IPv4UDPUnboundExternalNetworkingSocketTest::SetUp() {
   got_if_infos_ = false;
 
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
index 8f47952b0..f6e64c157 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
@@ -22,6 +22,7 @@
 
 namespace gvisor {
 namespace testing {
+namespace {
 
 std::vector<SocketKind> GetSockets() {
   return ApplyVec<SocketKind>(
@@ -32,5 +33,7 @@ std::vector<SocketKind> GetSockets() {
 INSTANTIATE_TEST_SUITE_P(IPv4UDPUnboundSockets,
                          IPv4UDPUnboundExternalNetworkingSocketTest,
                          ::testing::ValuesIn(GetSockets()));
+
+}  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
index c0c5ab3fe..5d3a39868 100644
--- a/test/syscalls/linux/socket_test_util.cc
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -805,6 +805,24 @@ TestAddress V4MappedLoopback() {
   return t;
 }
 
+TestAddress V4Multicast() {
+  TestAddress t("V4Multicast");
+  t.addr.ss_family = AF_INET;
+  t.addr_len = sizeof(sockaddr_in);
+  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
+      inet_addr(kMulticastAddress);
+  return t;
+}
+
+TestAddress V4Broadcast() {
+  TestAddress t("V4Broadcast");
+  t.addr.ss_family = AF_INET;
+  t.addr_len = sizeof(sockaddr_in);
+  reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
+      htonl(INADDR_BROADCAST);
+  return t;
+}
+
 TestAddress V6Any() {
   TestAddress t("V6Any");
   t.addr.ss_family = AF_INET6;
diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h
index bfaa6e397..734b48b96 100644
--- a/test/syscalls/linux/socket_test_util.h
+++ b/test/syscalls/linux/socket_test_util.h
@@ -484,10 +484,15 @@ struct TestAddress {
       : description(std::move(description)), addr(), addr_len() {}
 };
 
+constexpr char kMulticastAddress[] = "224.0.2.1";
+constexpr char kBroadcastAddress[] = "255.255.255.255";
+
 TestAddress V4Any();
+TestAddress V4Broadcast();
 TestAddress V4Loopback();
 TestAddress V4MappedAny();
 TestAddress V4MappedLoopback();
+TestAddress V4Multicast();
 TestAddress V6Any();
 TestAddress V6Loopback();
 
-- 
cgit v1.2.3


From 17b9f5e66238bde1e4ed3bd9e5fb67342c8b58ec Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 7 Feb 2020 14:46:24 -0800
Subject: Support listxattr and removexattr syscalls.

Note that these are only implemented for tmpfs, and other impls will still
return EOPNOTSUPP.

PiperOrigin-RevId: 293899385
---
 pkg/p9/client_file.go                      |  33 ++++
 pkg/p9/file.go                             |  16 ++
 pkg/p9/handlers.go                         |  33 ++++
 pkg/p9/messages.go                         | 199 +++++++++++++++----
 pkg/p9/p9.go                               |   4 +
 pkg/p9/version.go                          |   8 +-
 pkg/sentry/fs/copy_up.go                   |   2 +-
 pkg/sentry/fs/fsutil/inode.go              |  20 +-
 pkg/sentry/fs/gofer/context_file.go        |  14 ++
 pkg/sentry/fs/gofer/inode.go               |  13 +-
 pkg/sentry/fs/inode.go                     |  14 +-
 pkg/sentry/fs/inode_operations.go          |  13 +-
 pkg/sentry/fs/inode_overlay.go             |  18 +-
 pkg/sentry/fs/tmpfs/tmpfs.go               |   9 +-
 pkg/sentry/syscalls/linux/linux64_amd64.go |  27 ++-
 pkg/sentry/syscalls/linux/linux64_arm64.go |  37 ++--
 pkg/sentry/syscalls/linux/sys_xattr.go     | 200 +++++++++++++++++++-
 runsc/fsgofer/fsgofer.go                   |  16 +-
 test/syscalls/linux/BUILD                  |   1 +
 test/syscalls/linux/xattr.cc               | 294 ++++++++++++++++-------------
 20 files changed, 733 insertions(+), 238 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go
index 0254e4ccc..2ee07b664 100644
--- a/pkg/p9/client_file.go
+++ b/pkg/p9/client_file.go
@@ -194,6 +194,39 @@ func (c *clientFile) SetXattr(name, value string, flags uint32) error {
 	return c.client.sendRecv(&Tsetxattr{FID: c.fid, Name: name, Value: value, Flags: flags}, &Rsetxattr{})
 }
 
+// ListXattr implements File.ListXattr.
+func (c *clientFile) ListXattr(size uint64) (map[string]struct{}, error) {
+	if atomic.LoadUint32(&c.closed) != 0 {
+		return nil, syscall.EBADF
+	}
+	if !versionSupportsListRemoveXattr(c.client.version) {
+		return nil, syscall.EOPNOTSUPP
+	}
+
+	rlistxattr := Rlistxattr{}
+	if err := c.client.sendRecv(&Tlistxattr{FID: c.fid, Size: size}, &rlistxattr); err != nil {
+		return nil, err
+	}
+
+	xattrs := make(map[string]struct{}, len(rlistxattr.Xattrs))
+	for _, x := range rlistxattr.Xattrs {
+		xattrs[x] = struct{}{}
+	}
+	return xattrs, nil
+}
+
+// RemoveXattr implements File.RemoveXattr.
+func (c *clientFile) RemoveXattr(name string) error {
+	if atomic.LoadUint32(&c.closed) != 0 {
+		return syscall.EBADF
+	}
+	if !versionSupportsListRemoveXattr(c.client.version) {
+		return syscall.EOPNOTSUPP
+	}
+
+	return c.client.sendRecv(&Tremovexattr{FID: c.fid, Name: name}, &Rremovexattr{})
+}
+
 // Allocate implements File.Allocate.
 func (c *clientFile) Allocate(mode AllocateMode, offset, length uint64) error {
 	if atomic.LoadUint32(&c.closed) != 0 {
diff --git a/pkg/p9/file.go b/pkg/p9/file.go
index 4607cfcdf..d4ffbc8e3 100644
--- a/pkg/p9/file.go
+++ b/pkg/p9/file.go
@@ -105,6 +105,22 @@ type File interface {
 	// TODO(b/127675828): Determine concurrency guarantees once implemented.
 	SetXattr(name, value string, flags uint32) error
 
+	// ListXattr lists the names of the extended attributes on this node.
+	//
+	// Size indicates the size of the buffer that has been allocated to hold the
+	// attribute list. If the list would be larger than size, implementations may
+	// return ERANGE to indicate that the buffer is too small, but they are also
+	// free to ignore the hint entirely (i.e. the value returned may be larger
+	// than size). All size checking is done independently at the syscall layer.
+	//
+	// TODO(b/148303075): Determine concurrency guarantees once implemented.
+	ListXattr(size uint64) (map[string]struct{}, error)
+
+	// RemoveXattr removes the extended attribute with the given name from this node.
+	//
+	// TODO(b/148303075): Determine concurrency guarantees once implemented.
+	RemoveXattr(name string) error
+
 	// Allocate allows the caller to directly manipulate the allocated disk space
 	// for the file. See fallocate(2) for more details.
 	Allocate(mode AllocateMode, offset, length uint64) error
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index 7d6653a07..2ac45eb80 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -941,6 +941,39 @@ func (t *Tsetxattr) handle(cs *connState) message {
 	return &Rsetxattr{}
 }
 
+// handle implements handler.handle.
+func (t *Tlistxattr) handle(cs *connState) message {
+	ref, ok := cs.LookupFID(t.FID)
+	if !ok {
+		return newErr(syscall.EBADF)
+	}
+	defer ref.DecRef()
+
+	xattrs, err := ref.file.ListXattr(t.Size)
+	if err != nil {
+		return newErr(err)
+	}
+	xattrList := make([]string, 0, len(xattrs))
+	for x := range xattrs {
+		xattrList = append(xattrList, x)
+	}
+	return &Rlistxattr{Xattrs: xattrList}
+}
+
+// handle implements handler.handle.
+func (t *Tremovexattr) handle(cs *connState) message {
+	ref, ok := cs.LookupFID(t.FID)
+	if !ok {
+		return newErr(syscall.EBADF)
+	}
+	defer ref.DecRef()
+
+	if err := ref.file.RemoveXattr(t.Name); err != nil {
+		return newErr(err)
+	}
+	return &Rremovexattr{}
+}
+
 // handle implements handler.handle.
 func (t *Treaddir) handle(cs *connState) message {
 	ref, ok := cs.LookupFID(t.Directory)
diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go
index ceb723d86..b1cede5f5 100644
--- a/pkg/p9/messages.go
+++ b/pkg/p9/messages.go
@@ -174,11 +174,11 @@ type Rflush struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rflush) Decode(b *buffer) {
+func (*Rflush) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rflush) Encode(b *buffer) {
+func (*Rflush) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -188,7 +188,7 @@ func (*Rflush) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rflush) String() string {
-	return fmt.Sprintf("RFlush{}")
+	return "RFlush{}"
 }
 
 // Twalk is a walk request.
@@ -300,11 +300,11 @@ type Rclunk struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rclunk) Decode(b *buffer) {
+func (*Rclunk) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rclunk) Encode(b *buffer) {
+func (*Rclunk) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -314,7 +314,7 @@ func (*Rclunk) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rclunk) String() string {
-	return fmt.Sprintf("Rclunk{}")
+	return "Rclunk{}"
 }
 
 // Tremove is a remove request.
@@ -350,11 +350,11 @@ type Rremove struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rremove) Decode(b *buffer) {
+func (*Rremove) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rremove) Encode(b *buffer) {
+func (*Rremove) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -364,7 +364,7 @@ func (*Rremove) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rremove) String() string {
-	return fmt.Sprintf("Rremove{}")
+	return "Rremove{}"
 }
 
 // Rlerror is an error response.
@@ -745,16 +745,16 @@ func (*Rlink) Type() MsgType {
 }
 
 // Decode implements encoder.Decode.
-func (*Rlink) Decode(b *buffer) {
+func (*Rlink) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rlink) Encode(b *buffer) {
+func (*Rlink) Encode(*buffer) {
 }
 
 // String implements fmt.Stringer.
 func (r *Rlink) String() string {
-	return fmt.Sprintf("Rlink{}")
+	return "Rlink{}"
 }
 
 // Trenameat is a rename request.
@@ -803,11 +803,11 @@ type Rrenameat struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rrenameat) Decode(b *buffer) {
+func (*Rrenameat) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rrenameat) Encode(b *buffer) {
+func (*Rrenameat) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -817,7 +817,7 @@ func (*Rrenameat) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rrenameat) String() string {
-	return fmt.Sprintf("Rrenameat{}")
+	return "Rrenameat{}"
 }
 
 // Tunlinkat is an unlink request.
@@ -861,11 +861,11 @@ type Runlinkat struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Runlinkat) Decode(b *buffer) {
+func (*Runlinkat) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Runlinkat) Encode(b *buffer) {
+func (*Runlinkat) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -875,7 +875,7 @@ func (*Runlinkat) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Runlinkat) String() string {
-	return fmt.Sprintf("Runlinkat{}")
+	return "Runlinkat{}"
 }
 
 // Trename is a rename request.
@@ -922,11 +922,11 @@ type Rrename struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rrename) Decode(b *buffer) {
+func (*Rrename) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rrename) Encode(b *buffer) {
+func (*Rrename) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -936,7 +936,7 @@ func (*Rrename) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rrename) String() string {
-	return fmt.Sprintf("Rrename{}")
+	return "Rrename{}"
 }
 
 // Treadlink is a readlink request.
@@ -1409,11 +1409,11 @@ type Rsetattr struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rsetattr) Decode(b *buffer) {
+func (*Rsetattr) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rsetattr) Encode(b *buffer) {
+func (*Rsetattr) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1423,7 +1423,7 @@ func (*Rsetattr) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rsetattr) String() string {
-	return fmt.Sprintf("Rsetattr{}")
+	return "Rsetattr{}"
 }
 
 // Tallocate is an allocate request. This is an extension to 9P protocol, not
@@ -1466,11 +1466,11 @@ type Rallocate struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rallocate) Decode(b *buffer) {
+func (*Rallocate) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rallocate) Encode(b *buffer) {
+func (*Rallocate) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1480,7 +1480,71 @@ func (*Rallocate) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rallocate) String() string {
-	return fmt.Sprintf("Rallocate{}")
+	return "Rallocate{}"
+}
+
+// Tlistxattr is a listxattr request.
+type Tlistxattr struct {
+	// FID refers to the file on which to list xattrs.
+	FID FID
+
+	// Size is the buffer size for the xattr list.
+	Size uint64
+}
+
+// Decode implements encoder.Decode.
+func (t *Tlistxattr) Decode(b *buffer) {
+	t.FID = b.ReadFID()
+	t.Size = b.Read64()
+}
+
+// Encode implements encoder.Encode.
+func (t *Tlistxattr) Encode(b *buffer) {
+	b.WriteFID(t.FID)
+	b.Write64(t.Size)
+}
+
+// Type implements message.Type.
+func (*Tlistxattr) Type() MsgType {
+	return MsgTlistxattr
+}
+
+// String implements fmt.Stringer.
+func (t *Tlistxattr) String() string {
+	return fmt.Sprintf("Tlistxattr{FID: %d, Size: %d}", t.FID, t.Size)
+}
+
+// Rlistxattr is a listxattr response.
+type Rlistxattr struct {
+	// Xattrs is a list of extended attribute names.
+	Xattrs []string
+}
+
+// Decode implements encoder.Decode.
+func (r *Rlistxattr) Decode(b *buffer) {
+	n := b.Read16()
+	r.Xattrs = r.Xattrs[:0]
+	for i := 0; i < int(n); i++ {
+		r.Xattrs = append(r.Xattrs, b.ReadString())
+	}
+}
+
+// Encode implements encoder.Encode.
+func (r *Rlistxattr) Encode(b *buffer) {
+	b.Write16(uint16(len(r.Xattrs)))
+	for _, x := range r.Xattrs {
+		b.WriteString(x)
+	}
+}
+
+// Type implements message.Type.
+func (*Rlistxattr) Type() MsgType {
+	return MsgRlistxattr
+}
+
+// String implements fmt.Stringer.
+func (r *Rlistxattr) String() string {
+	return fmt.Sprintf("Rlistxattr{Xattrs: %v}", r.Xattrs)
 }
 
 // Txattrwalk walks extended attributes.
@@ -1594,11 +1658,11 @@ type Rxattrcreate struct {
 }
 
 // Decode implements encoder.Decode.
-func (r *Rxattrcreate) Decode(b *buffer) {
+func (r *Rxattrcreate) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (r *Rxattrcreate) Encode(b *buffer) {
+func (r *Rxattrcreate) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1608,7 +1672,7 @@ func (*Rxattrcreate) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rxattrcreate) String() string {
-	return fmt.Sprintf("Rxattrcreate{}")
+	return "Rxattrcreate{}"
 }
 
 // Tgetxattr is a getxattr request.
@@ -1719,11 +1783,11 @@ type Rsetxattr struct {
 }
 
 // Decode implements encoder.Decode.
-func (r *Rsetxattr) Decode(b *buffer) {
+func (r *Rsetxattr) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (r *Rsetxattr) Encode(b *buffer) {
+func (r *Rsetxattr) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1733,7 +1797,60 @@ func (*Rsetxattr) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rsetxattr) String() string {
-	return fmt.Sprintf("Rsetxattr{}")
+	return "Rsetxattr{}"
+}
+
+// Tremovexattr is a removexattr request.
+type Tremovexattr struct {
+	// FID refers to the file on which to remove xattrs.
+	FID FID
+
+	// Name is the name of the attribute to remove.
+	Name string
+}
+
+// Decode implements encoder.Decode.
+func (t *Tremovexattr) Decode(b *buffer) {
+	t.FID = b.ReadFID()
+	t.Name = b.ReadString()
+}
+
+// Encode implements encoder.Encode.
+func (t *Tremovexattr) Encode(b *buffer) {
+	b.WriteFID(t.FID)
+	b.WriteString(t.Name)
+}
+
+// Type implements message.Type.
+func (*Tremovexattr) Type() MsgType {
+	return MsgTremovexattr
+}
+
+// String implements fmt.Stringer.
+func (t *Tremovexattr) String() string {
+	return fmt.Sprintf("Tremovexattr{FID: %d, Name: %s}", t.FID, t.Name)
+}
+
+// Rremovexattr is a removexattr response.
+type Rremovexattr struct {
+}
+
+// Decode implements encoder.Decode.
+func (r *Rremovexattr) Decode(*buffer) {
+}
+
+// Encode implements encoder.Encode.
+func (r *Rremovexattr) Encode(*buffer) {
+}
+
+// Type implements message.Type.
+func (*Rremovexattr) Type() MsgType {
+	return MsgRremovexattr
+}
+
+// String implements fmt.Stringer.
+func (r *Rremovexattr) String() string {
+	return "Rremovexattr{}"
 }
 
 // Treaddir is a readdir request.
@@ -1880,11 +1997,11 @@ type Rfsync struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rfsync) Decode(b *buffer) {
+func (*Rfsync) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rfsync) Encode(b *buffer) {
+func (*Rfsync) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1894,7 +2011,7 @@ func (*Rfsync) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (r *Rfsync) String() string {
-	return fmt.Sprintf("Rfsync{}")
+	return "Rfsync{}"
 }
 
 // Tstatfs is a stat request.
@@ -1980,11 +2097,11 @@ type Rflushf struct {
 }
 
 // Decode implements encoder.Decode.
-func (*Rflushf) Decode(b *buffer) {
+func (*Rflushf) Decode(*buffer) {
 }
 
 // Encode implements encoder.Encode.
-func (*Rflushf) Encode(b *buffer) {
+func (*Rflushf) Encode(*buffer) {
 }
 
 // Type implements message.Type.
@@ -1994,7 +2111,7 @@ func (*Rflushf) Type() MsgType {
 
 // String implements fmt.Stringer.
 func (*Rflushf) String() string {
-	return fmt.Sprintf("Rflushf{}")
+	return "Rflushf{}"
 }
 
 // Twalkgetattr is a walk request.
@@ -2484,6 +2601,8 @@ func init() {
 	msgRegistry.register(MsgRgetattr, func() message { return &Rgetattr{} })
 	msgRegistry.register(MsgTsetattr, func() message { return &Tsetattr{} })
 	msgRegistry.register(MsgRsetattr, func() message { return &Rsetattr{} })
+	msgRegistry.register(MsgTlistxattr, func() message { return &Tlistxattr{} })
+	msgRegistry.register(MsgRlistxattr, func() message { return &Rlistxattr{} })
 	msgRegistry.register(MsgTxattrwalk, func() message { return &Txattrwalk{} })
 	msgRegistry.register(MsgRxattrwalk, func() message { return &Rxattrwalk{} })
 	msgRegistry.register(MsgTxattrcreate, func() message { return &Txattrcreate{} })
@@ -2492,6 +2611,8 @@ func init() {
 	msgRegistry.register(MsgRgetxattr, func() message { return &Rgetxattr{} })
 	msgRegistry.register(MsgTsetxattr, func() message { return &Tsetxattr{} })
 	msgRegistry.register(MsgRsetxattr, func() message { return &Rsetxattr{} })
+	msgRegistry.register(MsgTremovexattr, func() message { return &Tremovexattr{} })
+	msgRegistry.register(MsgRremovexattr, func() message { return &Rremovexattr{} })
 	msgRegistry.register(MsgTreaddir, func() message { return &Treaddir{} })
 	msgRegistry.register(MsgRreaddir, func() message { return &Rreaddir{} })
 	msgRegistry.register(MsgTfsync, func() message { return &Tfsync{} })
diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go
index 5ab00d625..20ab31f7a 100644
--- a/pkg/p9/p9.go
+++ b/pkg/p9/p9.go
@@ -335,6 +335,8 @@ const (
 	MsgRgetattr             = 25
 	MsgTsetattr             = 26
 	MsgRsetattr             = 27
+	MsgTlistxattr           = 28
+	MsgRlistxattr           = 29
 	MsgTxattrwalk           = 30
 	MsgRxattrwalk           = 31
 	MsgTxattrcreate         = 32
@@ -343,6 +345,8 @@ const (
 	MsgRgetxattr            = 35
 	MsgTsetxattr            = 36
 	MsgRsetxattr            = 37
+	MsgTremovexattr         = 38
+	MsgRremovexattr         = 39
 	MsgTreaddir             = 40
 	MsgRreaddir             = 41
 	MsgTfsync               = 50
diff --git a/pkg/p9/version.go b/pkg/p9/version.go
index 34a15eb55..09cde9f5a 100644
--- a/pkg/p9/version.go
+++ b/pkg/p9/version.go
@@ -26,7 +26,7 @@ const (
 	//
 	// Clients are expected to start requesting this version number and
 	// to continuously decrement it until a Tversion request succeeds.
-	highestSupportedVersion uint32 = 10
+	highestSupportedVersion uint32 = 11
 
 	// lowestSupportedVersion is the lowest supported version X in a
 	// version string of the format 9P2000.L.Google.X.
@@ -167,3 +167,9 @@ func VersionSupportsOpenTruncateFlag(v uint32) bool {
 func versionSupportsGetSetXattr(v uint32) bool {
 	return v >= 10
 }
+
+// versionSupportsListRemoveXattr returns true if version v supports
+// the Tlistxattr and Tremovexattr messages.
+func versionSupportsListRemoveXattr(v uint32) bool {
+	return v >= 11
+}
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index f6c79e51b..b060a12ff 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -401,7 +401,7 @@ func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error
 	if err != nil {
 		return err
 	}
-	lowerXattr, err := lower.ListXattr(ctx)
+	lowerXattr, err := lower.ListXattr(ctx, linux.XATTR_SIZE_MAX)
 	if err != nil && err != syserror.EOPNOTSUPP {
 		return err
 	}
diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go
index 252830572..daecc4ffe 100644
--- a/pkg/sentry/fs/fsutil/inode.go
+++ b/pkg/sentry/fs/fsutil/inode.go
@@ -247,7 +247,7 @@ func (i *InodeSimpleExtendedAttributes) SetXattr(_ context.Context, _ *fs.Inode,
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
+func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) {
 	i.mu.RLock()
 	names := make(map[string]struct{}, len(i.xattrs))
 	for name := range i.xattrs {
@@ -257,6 +257,17 @@ func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (m
 	return names, nil
 }
 
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (i *InodeSimpleExtendedAttributes) RemoveXattr(_ context.Context, _ *fs.Inode, name string) error {
+	i.mu.Lock() // Exclusive lock: the delete below mutates i.xattrs.
+	defer i.mu.Unlock()
+	if _, ok := i.xattrs[name]; ok {
+		delete(i.xattrs, name)
+		return nil
+	}
+	return syserror.ENOATTR // No attribute with this name is present.
+}
+
 // staticFile is a file with static contents. It is returned by
 // InodeStaticFileGetter.GetFile.
 //
@@ -460,10 +471,15 @@ func (InodeNoExtendedAttributes) SetXattr(context.Context, *fs.Inode, string, st
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
+func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) {
 	return nil, syserror.EOPNOTSUPP
 }
 
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (InodeNoExtendedAttributes) RemoveXattr(context.Context, *fs.Inode, string) error {
+	return syserror.EOPNOTSUPP
+}
+
 // InodeNoopRelease implements fs.InodeOperations.Release as a noop.
 type InodeNoopRelease struct{}
 
diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go
index 3da818aed..125907d70 100644
--- a/pkg/sentry/fs/gofer/context_file.go
+++ b/pkg/sentry/fs/gofer/context_file.go
@@ -73,6 +73,20 @@ func (c *contextFile) setXattr(ctx context.Context, name, value string, flags ui
 	return err
 }
 
+func (c *contextFile) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
+	ctx.UninterruptibleSleepStart(false)
+	xattrs, err := c.file.ListXattr(size)
+	ctx.UninterruptibleSleepFinish(false)
+	return xattrs, err
+}
+
+func (c *contextFile) removeXattr(ctx context.Context, name string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := c.file.RemoveXattr(name)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
 func (c *contextFile) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
 	ctx.UninterruptibleSleepStart(false)
 	err := c.file.Allocate(mode, offset, length)
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index ac28174d2..1c934981b 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -604,18 +604,23 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length
 }
 
 // GetXattr implements fs.InodeOperations.GetXattr.
-func (i *inodeOperations) GetXattr(ctx context.Context, inode *fs.Inode, name string, size uint64) (string, error) {
+func (i *inodeOperations) GetXattr(ctx context.Context, _ *fs.Inode, name string, size uint64) (string, error) {
 	return i.fileState.file.getXattr(ctx, name, size)
 }
 
 // SetXattr implements fs.InodeOperations.SetXattr.
-func (i *inodeOperations) SetXattr(ctx context.Context, inode *fs.Inode, name string, value string, flags uint32) error {
+func (i *inodeOperations) SetXattr(ctx context.Context, _ *fs.Inode, name string, value string, flags uint32) error {
 	return i.fileState.file.setXattr(ctx, name, value, flags)
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (i *inodeOperations) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) {
-	return nil, syscall.EOPNOTSUPP
+func (i *inodeOperations) ListXattr(ctx context.Context, _ *fs.Inode, size uint64) (map[string]struct{}, error) {
+	return i.fileState.file.listXattr(ctx, size)
+}
+
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (i *inodeOperations) RemoveXattr(ctx context.Context, _ *fs.Inode, name string) error {
+	return i.fileState.file.removeXattr(ctx, name)
 }
 
 // Allocate implements fs.InodeOperations.Allocate.
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index b66c091ab..55fb71c16 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -278,11 +278,19 @@ func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, fla
 }
 
 // ListXattr calls i.InodeOperations.ListXattr with i as the Inode.
-func (i *Inode) ListXattr(ctx context.Context) (map[string]struct{}, error) {
+func (i *Inode) ListXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
 	if i.overlay != nil {
-		return overlayListXattr(ctx, i.overlay)
+		return overlayListXattr(ctx, i.overlay, size)
 	}
-	return i.InodeOperations.ListXattr(ctx, i)
+	return i.InodeOperations.ListXattr(ctx, i, size)
+}
+
+// RemoveXattr calls i.InodeOperations.RemoveXattr with i as the Inode.
+func (i *Inode) RemoveXattr(ctx context.Context, d *Dirent, name string) error {
+	if i.overlay != nil {
+		return overlayRemoveXattr(ctx, i.overlay, d, name)
+	}
+	return i.InodeOperations.RemoveXattr(ctx, i, name)
 }
 
 // CheckPermission will check if the caller may access this file in the
diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go
index 70f2eae96..2bbfb72ef 100644
--- a/pkg/sentry/fs/inode_operations.go
+++ b/pkg/sentry/fs/inode_operations.go
@@ -190,7 +190,18 @@ type InodeOperations interface {
 	// ListXattr returns the set of all extended attributes names that
 	// have values. Inodes that do not support extended attributes return
 	// EOPNOTSUPP.
-	ListXattr(ctx context.Context, inode *Inode) (map[string]struct{}, error)
+	//
+	// If this is called through the listxattr(2) syscall, size indicates the
+	// size of the buffer that the application has allocated to hold the
+	// attribute list. If the list would be larger than size, implementations may
+	// return ERANGE to indicate that the buffer is too small, but they are also
+	// free to ignore the hint entirely. All size checking is done independently
+	// at the syscall layer.
+	ListXattr(ctx context.Context, inode *Inode, size uint64) (map[string]struct{}, error)
+
+	// RemoveXattr removes an extended attribute specified by name. Inodes that
+	// do not support extended attributes return EOPNOTSUPP.
+	RemoveXattr(ctx context.Context, inode *Inode, name string) error
 
 	// Check determines whether an Inode can be accessed with the
 	// requested permission mask using the context (which gives access
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index 4729b4aac..5ada33a32 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -564,15 +564,15 @@ func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, valu
 	return o.upper.SetXattr(ctx, d, name, value, flags)
 }
 
-func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}, error) {
+func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[string]struct{}, error) {
 	o.copyMu.RLock()
 	defer o.copyMu.RUnlock()
 	var names map[string]struct{}
 	var err error
 	if o.upper != nil {
-		names, err = o.upper.ListXattr(ctx)
+		names, err = o.upper.ListXattr(ctx, size)
 	} else {
-		names, err = o.lower.ListXattr(ctx)
+		names, err = o.lower.ListXattr(ctx, size)
 	}
 	for name := range names {
 		// Same as overlayGetXattr, we shouldn't forward along
@@ -584,6 +584,18 @@ func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}
 	return names, err
 }
 
+// overlayRemoveXattr removes xattr name from d's upper inode, copying d up first.
+func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error {
+	// Don't allow changes to overlay xattrs through a removexattr syscall.
+	if strings.HasPrefix(name, XattrOverlayPrefix) {
+		return syserror.EPERM
+	}
+	if err := copyUp(ctx, d); err != nil {
+		return err
+	}
+	return o.upper.RemoveXattr(ctx, d, name)
+}
+
 func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error {
 	o.copyMu.RLock()
 	// Hot path. Avoid defers.
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index c00cef0a5..3c2b583ae 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -159,8 +159,13 @@ func (d *Dir) SetXattr(ctx context.Context, i *fs.Inode, name, value string, fla
 }
 
 // ListXattr implements fs.InodeOperations.ListXattr.
-func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode) (map[string]struct{}, error) {
-	return d.ramfsDir.ListXattr(ctx, i)
+func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode, size uint64) (map[string]struct{}, error) {
+	return d.ramfsDir.ListXattr(ctx, i, size)
+}
+
+// RemoveXattr implements fs.InodeOperations.RemoveXattr.
+func (d *Dir) RemoveXattr(ctx context.Context, i *fs.Inode, name string) error {
+	return d.ramfsDir.RemoveXattr(ctx, i, name)
 }
 
 // Lookup implements fs.InodeOperations.Lookup.
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 588f8b087..79066ad2a 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -228,21 +228,18 @@ var AMD64 = &kernel.SyscallTable{
 		185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
-		// TODO(b/148303075): Enable set/getxattr (in their various
-		// forms) once we also have list and removexattr. The JVM
-		// assumes that if get/set exist, then list and remove do too.
-		188: syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		189: syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		190: syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+		194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+		195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+		196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+		197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+		198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+		199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
 		200: syscalls.Supported("tkill", Tkill),
 		201: syscalls.Supported("time", Time),
 		202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index 06e5ee401..7421619de 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -36,26 +36,23 @@ var ARM64 = &kernel.SyscallTable{
 	},
 	AuditNumber: linux.AUDIT_ARCH_AARCH64,
 	Table: map[uintptr]kernel.Syscall{
-		0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
-		// TODO(b/148303075): Enable set/getxattr (in their various
-		// forms) once we also have list and removexattr. The JVM
-		// assumes that if get/set exist, then list and remove do too.
-		5:   syscalls.ErrorWithEvent("setxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		6:   syscalls.ErrorWithEvent("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		7:   syscalls.ErrorWithEvent("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		8:   syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		13:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		14:  syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		15:  syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
-		16:  syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", []string{"gvisor.dev/issue/1636"}),
+		0:   syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		1:   syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		2:   syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
+		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
+		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
+		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
+		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
+		11:  syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil),
+		12:  syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil),
+		13:  syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil),
+		14:  syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil),
+		15:  syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil),
+		16:  syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil),
 		17:  syscalls.Supported("getcwd", Getcwd),
 		18:  syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
 		19:  syscalls.Supported("eventfd2", Eventfd2),
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index efb95555c..342337726 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -72,7 +72,7 @@ func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink
 	}
 
 	valueLen := 0
-	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
 		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
 			return syserror.ENOTDIR
 		}
@@ -172,7 +172,7 @@ func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink
 		return 0, nil, err
 	}
 
-	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
 		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
 			return syserror.ENOTDIR
 		}
@@ -187,12 +187,12 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, si
 		return syserror.EINVAL
 	}
 
-	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
 		return err
 	}
 
-	name, err := copyInXattrName(t, nameAddr)
-	if err != nil {
+	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
 		return err
 	}
 
@@ -226,12 +226,18 @@ func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
 	return name, nil
 }
 
+// xattrFileTypeOk reports whether i is a regular file or a directory, the only
+// file types on which xattrs are permitted here.
+// TODO(b/148380782): Linux only applies this restriction to the "user.*"
+// namespace; make the check namespace-specific once other prefixes are allowed.
+func xattrFileTypeOk(i *fs.Inode) bool {
+	attr := i.StableAttr
+	return fs.IsRegular(attr) || fs.IsDir(attr)
+}
+
 func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error {
 	// Restrict xattrs to regular files and directories.
-	//
-	// In Linux, this restriction technically only applies to xattrs in the
-	// "user.*" namespace, but we don't allow any other xattr prefixes anyway.
-	if !fs.IsRegular(i.StableAttr) && !fs.IsDir(i.StableAttr) {
+	if !xattrFileTypeOk(i) {
 		if perms.Write {
 			return syserror.EPERM
 		}
@@ -240,3 +246,179 @@ func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error
 
 	return i.CheckPermission(t, perms)
 }
+
+// ListXattr implements linux syscall listxattr(2), with resolveSymlink set.
+func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listXattrFromPath(t, args, true)
+}
+
+// LListXattr implements linux syscall llistxattr(2), with resolveSymlink unset.
+func LListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return listXattrFromPath(t, args, false)
+}
+
+// FListXattr implements linux syscall flistxattr(2).
+func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		fd      = args[0].Int()
+		bufAddr = args[1].Pointer()
+		bufSize = uint64(args[2].SizeT())
+	)
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	listLen, err := listXattr(t, file.Dirent, bufAddr, bufSize)
+	if err != nil {
+		return 0, nil, err
+	}
+	return uintptr(listLen), nil, nil
+}
+
+// listXattrFromPath implements listxattr(2) and llistxattr(2) for the path in args[0].
+func listXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+	var (
+		pathAddr = args[0].Pointer()
+		bufAddr  = args[1].Pointer()
+		bufSize  = uint64(args[2].SizeT())
+	)
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+	var listLen int
+	walkErr := fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+		var listErr error
+		listLen, listErr = listXattr(t, d, bufAddr, bufSize)
+		return listErr
+	})
+	if walkErr != nil {
+		return 0, nil, walkErr
+	}
+	return uintptr(listLen), nil, nil
+}
+
+// listXattr copies d's xattr names that carry linux.XATTR_USER_PREFIX, each
+// followed by a zero byte, to addr and returns the list's size in bytes. With
+// size 0 only the size is returned; unsupported file types yield 0.
+func listXattr(t *kernel.Task, d *fs.Dirent, addr usermem.Addr, size uint64) (int, error) {
+	if !xattrFileTypeOk(d.Inode) {
+		return 0, nil
+	}
+
+	// A size of 0 asks only for the buffer size needed to hold the list, so
+	// the whole list must be fetched to compute it. Oversized requests are
+	// clamped to the same maximum.
+	limit := size
+	if size == 0 || size > linux.XATTR_SIZE_MAX {
+		limit = linux.XATTR_SIZE_MAX
+	}
+	names, err := d.Inode.ListXattr(t, limit)
+	if err != nil {
+		return 0, err
+	}
+
+	// TODO(b/148380782): support namespaces other than "user".
+	for name := range names {
+		if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+			delete(names, name)
+		}
+	}
+
+	needed := xattrListSize(names)
+	switch {
+	case needed > linux.XATTR_SIZE_MAX:
+		return 0, syserror.E2BIG
+	case uint64(needed) > limit:
+		return 0, syserror.ERANGE
+	case size == 0:
+		// Size query only: nothing is copied out.
+		return needed, nil
+	}
+
+	out := make([]byte, 0, needed)
+	for name := range names {
+		out = append(out, name...)
+		out = append(out, 0)
+	}
+	if _, err := t.CopyOutBytes(addr, out); err != nil {
+		return 0, err
+	}
+
+	return len(out), nil
+}
+
+// xattrListSize returns the bytes needed for xattrs: each name plus one terminator.
+func xattrListSize(xattrs map[string]struct{}) (total int) {
+	for name := range xattrs {
+		total += len(name) + 1
+	}
+	return total
+}
+
+// RemoveXattr implements linux syscall removexattr(2), with resolveSymlink set.
+func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return removeXattrFromPath(t, args, true)
+}
+
+// LRemoveXattr implements linux syscall lremovexattr(2), with resolveSymlink unset.
+func LRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return removeXattrFromPath(t, args, false)
+}
+
+// FRemoveXattr implements linux syscall fremovexattr(2).
+func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd, nameAddr := args[0].Int(), args[1].Pointer()
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	file := t.GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+	if err := removeXattr(t, file.Dirent, nameAddr); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, nil
+}
+
+// removeXattrFromPath implements removexattr(2) and lremovexattr(2).
+func removeXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
+	pathAddr, nameAddr := args[0].Pointer(), args[1].Pointer()
+	path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// remove is the callback handed to fileOpOn.
+	remove := func(_ *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
+		return removeXattr(t, d, nameAddr)
+	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, remove)
+}
+
+// removeXattr implements removexattr(2) from the given *fs.Dirent.
+//
+// Checks run in order: name copy-in, write permission, then name prefix.
+func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
+	name, err := copyInXattrName(t, nameAddr)
+	if err != nil {
+		return err
+	}
+	writePerm := fs.PermMask{Write: true}
+	if err := checkXattrPermissions(t, d.Inode, writePerm); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	return d.Inode.RemoveXattr(t, d, name)
+}
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 4d84ad999..cadd83273 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -768,12 +768,22 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 }
 
 // TODO(b/127675828): support getxattr.
-func (l *localFile) GetXattr(name string, size uint64) (string, error) {
+func (*localFile) GetXattr(string, uint64) (string, error) {
 	return "", syscall.EOPNOTSUPP
 }
 
 // TODO(b/127675828): support setxattr.
-func (l *localFile) SetXattr(name, value string, flags uint32) error {
+func (*localFile) SetXattr(string, string, uint32) error {
+	return syscall.EOPNOTSUPP
+}
+
+// ListXattr always fails with EOPNOTSUPP. TODO(b/148303075): support listxattr.
+func (*localFile) ListXattr(uint64) (map[string]struct{}, error) {
+	return nil, syscall.EOPNOTSUPP
+}
+
+// TODO(b/148303075): support removexattr.
+func (*localFile) RemoveXattr(string) error {
 	return syscall.EOPNOTSUPP
 }
 
@@ -790,7 +800,7 @@ func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error
 }
 
 // Rename implements p9.File; this should never be called.
-func (l *localFile) Rename(p9.File, string) error {
+func (*localFile) Rename(p9.File, string) error {
 	panic("rename called directly")
 }
 
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 12d389c3e..ca1af209a 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3782,6 +3782,7 @@ cc_binary(
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         gtest,
         "//test/util:posix_error",
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index 85eb31847..8b00ef44c 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -24,6 +24,7 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
+#include "absl/container/flat_hash_set.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
@@ -38,36 +39,36 @@ namespace {
 
 class XattrTest : public FileTest {};
 
-TEST_F(XattrTest, XattrNullName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+TEST_F(XattrTest, XattrNonexistentFile) {
+  const char* missing = "/does/not/exist";
+  EXPECT_THAT(setxattr(missing, nullptr, nullptr, 0, /*flags=*/0),
+              SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(getxattr(missing, nullptr, nullptr, 0),
+              SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(listxattr(missing, nullptr, 0), SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(removexattr(missing, nullptr), SyscallFailsWithErrno(ENOENT));
+}
 
+TEST_F(XattrTest, XattrNullName) {
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EFAULT));
   EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
               SyscallFailsWithErrno(EFAULT));
+  EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(EFAULT));
 }
 
 TEST_F(XattrTest, XattrEmptyName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
 
   EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(ERANGE));
   EXPECT_THAT(getxattr(path, "", nullptr, 0), SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(removexattr(path, ""), SyscallFailsWithErrno(ERANGE));
 }
 
 TEST_F(XattrTest, XattrLargeName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   std::string name = "user.";
   name += std::string(XATTR_NAME_MAX - name.length(), 'a');
@@ -86,28 +87,23 @@ TEST_F(XattrTest, XattrLargeName) {
               SyscallFailsWithErrno(ERANGE));
   EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
               SyscallFailsWithErrno(ERANGE));
+  EXPECT_THAT(removexattr(path, name.c_str()), SyscallFailsWithErrno(ERANGE));
 }
 
 TEST_F(XattrTest, XattrInvalidPrefix) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   std::string name(XATTR_NAME_MAX, 'a');
   EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EOPNOTSUPP));
   EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
               SyscallFailsWithErrno(EOPNOTSUPP));
+  EXPECT_THAT(removexattr(path, name.c_str()),
+              SyscallFailsWithErrno(EOPNOTSUPP));
 }
 
 // Do not allow save/restore cycles after making the test file read-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -124,19 +120,21 @@ TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
 
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
               SyscallFailsWithErrno(EACCES));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EACCES));
 
   char buf = '-';
   EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
   EXPECT_EQ(buf, val);
+
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
 }
 
 // Do not allow save/restore cycles after making the test file write-only, as
 // the restore will fail to open it with r/w permissions.
 TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -152,6 +150,14 @@ TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
 
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
+
+  // listxattr will succeed even without read permissions.
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(removexattr(path, name), SyscallSucceeds());
 }
 
 TEST_F(XattrTest, XattrTrustedWithNonadmin) {
@@ -163,64 +169,66 @@ TEST_F(XattrTest, XattrTrustedWithNonadmin) {
   const char name[] = "trusted.abc";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, XattrOnDirectory) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const char name[] = "user.test";
-  EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(dir.path().c_str(), name, nullptr, 0, /*flags=*/0),
               SyscallSucceeds());
-  EXPECT_THAT(getxattr(dir.path().c_str(), name, NULL, 0),
+  EXPECT_THAT(getxattr(dir.path().c_str(), name, nullptr, 0),
               SyscallSucceedsWithValue(0));
+
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(dir.path().c_str(), list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(removexattr(dir.path().c_str(), name), SyscallSucceeds());
 }
 
 TEST_F(XattrTest, XattrOnSymlink) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
   const char name[] = "user.test";
-  EXPECT_THAT(setxattr(link.path().c_str(), name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(link.path().c_str(), name, nullptr, 0, /*flags=*/0),
               SyscallSucceeds());
-  EXPECT_THAT(getxattr(link.path().c_str(), name, NULL, 0),
+  EXPECT_THAT(getxattr(link.path().c_str(), name, nullptr, 0),
               SyscallSucceedsWithValue(0));
+
+  char list[sizeof(name)];
+  EXPECT_THAT(listxattr(link.path().c_str(), list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(removexattr(link.path().c_str(), name), SyscallSucceeds());
 }
 
 TEST_F(XattrTest, XattrOnInvalidFileTypes) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char name[] = "user.test";
 
   char char_device[] = "/dev/zero";
-  EXPECT_THAT(setxattr(char_device, name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(char_device, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EPERM));
-  EXPECT_THAT(getxattr(char_device, name, NULL, 0),
+  EXPECT_THAT(getxattr(char_device, name, nullptr, 0),
               SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(listxattr(char_device, nullptr, 0), SyscallSucceedsWithValue(0));
 
   // Use tmpfs, where creation of named pipes is supported.
   const std::string fifo = NewTempAbsPathInDir("/dev/shm");
   const char* path = fifo.c_str();
   EXPECT_THAT(mknod(path, S_IFIFO | S_IRUSR | S_IWUSR, 0), SyscallSucceeds());
-  EXPECT_THAT(setxattr(path, name, NULL, 0, /*flags=*/0),
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(EPERM));
-  EXPECT_THAT(getxattr(path, name, NULL, 0), SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(listxattr(path, nullptr, 0), SyscallSucceedsWithValue(0));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
 }
 
 TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -236,10 +244,6 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, SetxattrZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -252,10 +256,6 @@ TEST_F(XattrTest, SetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrSizeTooLarge) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
 
@@ -271,10 +271,6 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
@@ -284,10 +280,6 @@ TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -296,10 +288,6 @@ TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val(XATTR_SIZE_MAX + 1);
@@ -316,10 +304,6 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -335,10 +319,6 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithLarger) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -353,10 +333,6 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) {
 }
 
 TEST_F(XattrTest, SetxattrCreateFlag) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
@@ -368,10 +344,6 @@ TEST_F(XattrTest, SetxattrCreateFlag) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceFlag) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
@@ -384,10 +356,6 @@ TEST_F(XattrTest, SetxattrReplaceFlag) {
 }
 
 TEST_F(XattrTest, SetxattrInvalidFlags) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   int invalid_flags = 0xff;
   EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
@@ -395,10 +363,6 @@ TEST_F(XattrTest, SetxattrInvalidFlags) {
 }
 
 TEST_F(XattrTest, Getxattr) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -411,10 +375,6 @@ TEST_F(XattrTest, Getxattr) {
 }
 
 TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
@@ -427,10 +387,6 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -446,10 +402,6 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -463,10 +415,6 @@ TEST_F(XattrTest, GetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrSizeTooLarge) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -483,10 +431,6 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, GetxattrNullValue) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -498,10 +442,6 @@ TEST_F(XattrTest, GetxattrNullValue) {
 }
 
 TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   char val = 'a';
@@ -518,35 +458,109 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrNonexistentName) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, Listxattr) {
+  const char* path = test_file_name_.c_str();
+  const std::string name = "user.test";
+  const std::string name2 = "user.test2";
+  const std::string name3 = "user.test3";
+  EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name2.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
+  EXPECT_THAT(setxattr(path, name3.c_str(), nullptr, 0, /*flags=*/0),
+              SyscallSucceeds());
 
+  std::vector<char> list(name.size() + 1 + name2.size() + 1 + name3.size() + 1);
+  // Advertise exactly the buffer's size so the call cannot write past it.
+  char* buf = list.data();
+  EXPECT_THAT(listxattr(path, buf, list.size()),
+              SyscallSucceedsWithValue(list.size()));
+  absl::flat_hash_set<std::string> got = {};
+  for (char* p = buf; p < buf + list.size(); p += strlen(p) + 1) {
+    got.insert(std::string{p});
+  }
+  // Compare as sets so the order of the returned names does not matter.
+  absl::flat_hash_set<std::string> expected = {name, name2, name3};
+  EXPECT_EQ(got, expected);
+}
+
+TEST_F(XattrTest, ListxattrNoXattrs) {
+  const char* path = test_file_name_.c_str();
+  // Pass a real buffer with its true length (sizeof a vector is not that).
+  std::vector<char> list(16), expected(16);
+  EXPECT_THAT(listxattr(path, list.data(), list.size()),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(list, expected);
+
+  // Listxattr should succeed if there are no attributes, even if the buffer
+  // passed in is a nullptr.
+  EXPECT_THAT(listxattr(path, nullptr, list.size()),
+              SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, ListxattrNullBuffer) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+  EXPECT_THAT(listxattr(path, nullptr, sizeof(name)),
+              SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, ListxattrSizeTooSmall) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+  // sizeof(name) counts the terminating NUL, so this buffer is one byte short.
+  char list[sizeof(name) - 1];
+  EXPECT_THAT(listxattr(path, list, sizeof(list)),
+              SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, ListxattrZeroSize) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+  EXPECT_THAT(listxattr(path, nullptr, 0),  // size 0: expect the needed length
+              SyscallSucceedsWithValue(sizeof(name)));
+}
+
+TEST_F(XattrTest, RemoveXattr) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+  EXPECT_THAT(removexattr(path, name), SyscallSucceeds());
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
-TEST_F(XattrTest, LGetSetxattrOnSymlink) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+TEST_F(XattrTest, RemoveXattrNonexistentName) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";  // this test never sets it on the file
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(ENODATA));
+}
 
+TEST_F(XattrTest, LXattrOnSymlink) {
+  const char name[] = "user.test";
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
 
-  EXPECT_THAT(lsetxattr(link.path().c_str(), nullptr, nullptr, 0, 0),
+  EXPECT_THAT(lsetxattr(link.path().c_str(), name, nullptr, 0, 0),
               SyscallFailsWithErrno(EPERM));
-  EXPECT_THAT(lgetxattr(link.path().c_str(), nullptr, nullptr, 0),
+  EXPECT_THAT(lgetxattr(link.path().c_str(), name, nullptr, 0),
               SyscallFailsWithErrno(ENODATA));
+  EXPECT_THAT(llistxattr(link.path().c_str(), nullptr, 0),
+              SyscallSucceedsWithValue(0));
+  EXPECT_THAT(lremovexattr(link.path().c_str(), name),
+              SyscallFailsWithErrno(EPERM));
 }
 
-TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
-
+TEST_F(XattrTest, LXattrOnNonsymlink) {
   const char* path = test_file_name_.c_str();
   const char name[] = "user.test";
   int val = 1234;
@@ -558,13 +572,16 @@ TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
   EXPECT_THAT(lgetxattr(path, name, &buf, size),
               SyscallSucceedsWithValue(size));
   EXPECT_EQ(buf, val);
-}
 
-TEST_F(XattrTest, FGetSetxattr) {
-  // TODO(gvisor.dev/issue/1636): Re-enable once list/remove xattr are
-  // supported, and get/set have been added pack to the syscall table.
-  SKIP_IF(IsRunningOnGvisor());
+  char list[sizeof(name)];
+  EXPECT_THAT(llistxattr(path, list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
 
+  EXPECT_THAT(lremovexattr(path, name), SyscallSucceeds());
+}
+
+TEST_F(XattrTest, XattrWithFD) {
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0));
   const char name[] = "user.test";
@@ -577,6 +594,13 @@ TEST_F(XattrTest, FGetSetxattr) {
   EXPECT_THAT(fgetxattr(fd.get(), name, &buf, size),
               SyscallSucceedsWithValue(size));
   EXPECT_EQ(buf, val);
+
+  char list[sizeof(name)];
+  EXPECT_THAT(flistxattr(fd.get(), list, sizeof(list)),
+              SyscallSucceedsWithValue(sizeof(name)));
+  EXPECT_STREQ(list, name);
+
+  EXPECT_THAT(fremovexattr(fd.get(), name), SyscallSucceeds());
 }
 
 }  // namespace
-- 
cgit v1.2.3


From 75412ed9f5b6b327dec05ffff99d7fe6198d25a8 Mon Sep 17 00:00:00 2001
From: Zach Koopmans <zkoopmans@google.com>
Date: Mon, 10 Feb 2020 10:28:56 -0800
Subject: Internal change.

PiperOrigin-RevId: 294250370
---
 test/syscalls/linux/inotify.cc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index fdef646eb..0e13ad190 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -1055,9 +1055,9 @@ TEST(Inotify, ChmodGeneratesAttribEvent_NoRandomSave) {
   const TempPath file1 =
       ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
 
-  const FileDescriptor root_fd =
+  FileDescriptor root_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(root.path(), O_RDONLY));
-  const FileDescriptor file1_fd =
+  FileDescriptor file1_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR));
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
 
@@ -1091,6 +1091,11 @@ TEST(Inotify, ChmodGeneratesAttribEvent_NoRandomSave) {
   ASSERT_THAT(fchmodat(root_fd.get(), file1_basename.c_str(), S_IWGRP, 0),
               SyscallSucceeds());
   verify_chmod_events();
+
+  // Make sure the chmod'ed file descriptors are destroyed before DisableSave
+  // is destructed.
+  root_fd.reset();
+  file1_fd.reset();
 }
 
 TEST(Inotify, TruncateGeneratesModifyEvent) {
-- 
cgit v1.2.3


From 46a36b64d5164d1ac887aa528d23bb2f2c74489e Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 12 Feb 2020 06:35:20 -0800
Subject: Include more test files in exports_files

So that they can be included by Fuchsia's syscall tests

PiperOrigin-RevId: 294654890
---
 test/syscalls/linux/BUILD | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index ca1af209a..e7c82adfc 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -10,13 +10,16 @@ exports_files(
         "socket.cc",
         "socket_inet_loopback.cc",
         "socket_ip_loopback_blocking.cc",
+        "socket_ip_tcp_generic_loopback.cc",
         "socket_ip_tcp_loopback.cc",
+        "socket_ip_tcp_udp_generic.cc",
         "socket_ip_udp_loopback.cc",
         "socket_ip_unbound.cc",
         "socket_ipv4_tcp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_loopback.cc",
         "tcp_socket.cc",
+        "udp_bind.cc",
         "udp_socket.cc",
     ],
     visibility = ["//:sandbox"],
-- 
cgit v1.2.3


From 69bf39e8a47d3b4dcbbd04d2e8df476cdfab5e74 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 13 Feb 2020 10:58:47 -0800
Subject: Internal change.

PiperOrigin-RevId: 294952610
---
 pkg/abi/linux/socket.go                        | 13 ++++
 pkg/sentry/socket/control/BUILD                |  1 +
 pkg/sentry/socket/control/control.go           | 43 +++++++++++++
 pkg/sentry/socket/hostinet/socket.go           | 11 +++-
 pkg/sentry/socket/netstack/netstack.go         | 37 ++++++++++--
 pkg/tcpip/tcpip.go                             | 25 ++++++++
 pkg/tcpip/transport/udp/endpoint.go            | 26 ++++++++
 test/syscalls/linux/socket_ip_udp_generic.cc   | 44 ++++++++++++++
 test/syscalls/linux/socket_ipv4_udp_unbound.cc | 84 ++++++++++++++++++++++++++
 test/syscalls/linux/udp_socket_test_cases.cc   |  1 -
 10 files changed, 278 insertions(+), 7 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go
index 766ee4014..4a14ef691 100644
--- a/pkg/abi/linux/socket.go
+++ b/pkg/abi/linux/socket.go
@@ -411,6 +411,15 @@ type ControlMessageCredentials struct {
 	GID uint32
 }
 
+// A ControlMessageIPPacketInfo is an IP_PKTINFO socket control message.
+//
+// ControlMessageIPPacketInfo represents struct in_pktinfo from linux/in.h.
+type ControlMessageIPPacketInfo struct {
+	NIC             int32
+	LocalAddr       InetAddr
+	DestinationAddr InetAddr
+}
+
 // SizeOfControlMessageCredentials is the binary size of a
 // ControlMessageCredentials struct.
 var SizeOfControlMessageCredentials = int(binary.Size(ControlMessageCredentials{}))
@@ -431,6 +440,10 @@ const SizeOfControlMessageTOS = 1
 // SizeOfControlMessageTClass is the size of an IPV6_TCLASS control message.
 const SizeOfControlMessageTClass = 4
 
+// SizeOfControlMessageIPPacketInfo is the size of an IP_PKTINFO
+// control message.
+const SizeOfControlMessageIPPacketInfo = 12
+
 // SCM_MAX_FD is the maximum number of FDs accepted in a single sendmsg call.
 // From net/scm.h.
 const SCM_MAX_FD = 253
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 79e16d6e8..4d42d29cb 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -19,6 +19,7 @@ go_library(
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/syserror",
+        "//pkg/tcpip",
         "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 6145a7fc3..4667373d2 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -338,6 +339,22 @@ func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
 	)
 }
 
+// PackIPPacketInfo packs an IP_PKTINFO socket control message.
+func PackIPPacketInfo(t *kernel.Task, packetInfo tcpip.IPPacketInfo, buf []byte) []byte {
+	var p linux.ControlMessageIPPacketInfo
+	p.NIC = int32(packetInfo.NIC)
+	copy(p.LocalAddr[:], []byte(packetInfo.LocalAddr))
+	copy(p.DestinationAddr[:], []byte(packetInfo.DestinationAddr))
+	// Serialize p into buf as a SOL_IP / IP_PKTINFO control message.
+	return putCmsgStruct(
+		buf,
+		linux.SOL_IP,
+		linux.IP_PKTINFO,
+		t.Arch().Width(),
+		p,
+	)
+}
+
 // PackControlMessages packs control messages into the given buffer.
 //
 // We skip control messages specific to Unix domain sockets.
@@ -362,6 +379,10 @@ func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byt
 		buf = PackTClass(t, cmsgs.IP.TClass, buf)
 	}
 
+	if cmsgs.IP.HasIPPacketInfo {
+		buf = PackIPPacketInfo(t, cmsgs.IP.PacketInfo, buf)
+	}
+
 	return buf
 }
 
@@ -394,6 +415,16 @@ func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int {
 	return space
 }
 
+// NewIPPacketInfo converts a linux.ControlMessageIPPacketInfo into the
+// equivalent tcpip.IPPacketInfo, carrying over the NIC and both addresses.
+func NewIPPacketInfo(packetInfo linux.ControlMessageIPPacketInfo) tcpip.IPPacketInfo {
+	return tcpip.IPPacketInfo{
+		NIC:             tcpip.NICID(packetInfo.NIC),
+		LocalAddr:       tcpip.Address(packetInfo.LocalAddr[:]),
+		DestinationAddr: tcpip.Address(packetInfo.DestinationAddr[:]),
+	}
+}
+
 // Parse parses a raw socket control message into portable objects.
 func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.ControlMessages, error) {
 	var (
@@ -468,6 +499,18 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con
 				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS)
 				i += binary.AlignUp(length, width)
 
+			case linux.IP_PKTINFO:
+				if length < linux.SizeOfControlMessageIPPacketInfo {
+					return socket.ControlMessages{}, syserror.EINVAL
+				}
+
+				cmsgs.IP.HasIPPacketInfo = true
+				var packetInfo linux.ControlMessageIPPacketInfo
+				binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo)
+
+				cmsgs.IP.PacketInfo = NewIPPacketInfo(packetInfo)
+				i += binary.AlignUp(length, width)
+
 			default:
 				return socket.ControlMessages{}, syserror.EINVAL
 			}
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index de76388ac..22f78d2e2 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -289,7 +289,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
 	switch level {
 	case linux.SOL_IP:
 		switch name {
-		case linux.IP_TOS, linux.IP_RECVTOS:
+		case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO:
 			optlen = sizeofInt32
 		}
 	case linux.SOL_IPV6:
@@ -336,6 +336,8 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
 		switch name {
 		case linux.IP_TOS, linux.IP_RECVTOS:
 			optlen = sizeofInt32
+		case linux.IP_PKTINFO:
+			optlen = linux.SizeOfControlMessageIPPacketInfo
 		}
 	case linux.SOL_IPV6:
 		switch name {
@@ -473,7 +475,14 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
 			case syscall.IP_TOS:
 				controlMessages.IP.HasTOS = true
 				binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTOS], usermem.ByteOrder, &controlMessages.IP.TOS)
+
+			case syscall.IP_PKTINFO:
+				controlMessages.IP.HasIPPacketInfo = true
+				var packetInfo linux.ControlMessageIPPacketInfo
+				binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo)
+				controlMessages.IP.PacketInfo = control.NewIPPacketInfo(packetInfo)
 			}
+
 		case syscall.SOL_IPV6:
 			switch unixCmsg.Header.Type {
 			case syscall.IPV6_TCLASS:
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index ed2fbcceb..9757fbfba 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1414,6 +1414,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		}
 		return o, nil
 
+	case linux.IP_PKTINFO:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveIPPacketInfoOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+		var o int32
+		if v {
+			o = 1
+		}
+		return o, nil
+
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1762,6 +1777,7 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		linux.IPV6_IPSEC_POLICY,
 		linux.IPV6_JOIN_ANYCAST,
 		linux.IPV6_LEAVE_ANYCAST,
+		// TODO(b/148887420): Add support for IPV6_PKTINFO.
 		linux.IPV6_PKTINFO,
 		linux.IPV6_ROUTER_ALERT,
 		linux.IPV6_XFRM_POLICY,
@@ -1949,6 +1965,16 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0))
 
+	case linux.IP_PKTINFO:
+		if len(optVal) == 0 {
+			return nil
+		}
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0))
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
@@ -1964,7 +1990,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		linux.IP_NODEFRAG,
 		linux.IP_OPTIONS,
 		linux.IP_PASSSEC,
-		linux.IP_PKTINFO,
 		linux.IP_RECVERR,
 		linux.IP_RECVFRAGSIZE,
 		linux.IP_RECVOPTS,
@@ -2395,10 +2420,12 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 func (s *SocketOperations) controlMessages() socket.ControlMessages {
 	return socket.ControlMessages{
 		IP: tcpip.ControlMessages{
-			HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp,
-			Timestamp:    s.readCM.Timestamp,
-			HasTOS:       s.readCM.HasTOS,
-			TOS:          s.readCM.TOS,
+			HasTimestamp:    s.readCM.HasTimestamp && s.sockOptTimestamp,
+			Timestamp:       s.readCM.Timestamp,
+			HasTOS:          s.readCM.HasTOS,
+			TOS:             s.readCM.TOS,
+			HasIPPacketInfo: s.readCM.HasIPPacketInfo,
+			PacketInfo:      s.readCM.PacketInfo,
 		},
 	}
 }
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 0e944712f..9ca39ce40 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -328,6 +328,12 @@ type ControlMessages struct {
 
 	// Tclass is the IPv6 traffic class of the associated packet.
 	TClass int32
+
+	// HasIPPacketInfo indicates whether PacketInfo is set.
+	HasIPPacketInfo bool
+
+	// PacketInfo holds interface and address data on an incoming packet.
+	PacketInfo IPPacketInfo
 }
 
 // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
@@ -503,6 +509,11 @@ const (
 	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
 	// socket is to be restricted to sending and receiving IPv6 packets only.
 	V6OnlyOption
+
+	// ReceiveIPPacketInfoOption is used by {G,S}etSockOptBool to specify
+	// if more information is provided with incoming packets such
+	// as interface index and address.
+	ReceiveIPPacketInfoOption
 )
 
 // SockOptInt represents socket options which values have the int type.
@@ -685,6 +696,20 @@ type IPv4TOSOption uint8
 // for all subsequent outgoing IPv6 packets from the endpoint.
 type IPv6TrafficClassOption uint8
 
+// IPPacketInfo is the message structure for IP_PKTINFO.
+//
+// +stateify savable
+type IPPacketInfo struct {
+	// NIC is the ID of the NIC to be used.
+	NIC NICID
+
+	// LocalAddr is the local address.
+	LocalAddr Address
+
+	// DestinationAddr is the destination address.
+	DestinationAddr Address
+}
+
 // Route is a row in the routing table. It specifies through which NIC (and
 // gateway) sets of packets should be routed. A row is considered viable if the
 // masked target address matches the destination address in the row.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index c9cbed8f4..3fe91cac2 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -29,6 +29,7 @@ import (
 type udpPacket struct {
 	udpPacketEntry
 	senderAddress tcpip.FullAddress
+	packetInfo    tcpip.IPPacketInfo
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
 	tos           uint8
@@ -118,6 +119,9 @@ type endpoint struct {
 	// as ancillary data to ControlMessages on Read.
 	receiveTOS bool
 
+	// receiveIPPacketInfo determines if the packet info is returned by Read.
+	receiveIPPacketInfo bool
+
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -254,11 +258,17 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 	}
 	e.mu.RLock()
 	receiveTOS := e.receiveTOS
+	receiveIPPacketInfo := e.receiveIPPacketInfo
 	e.mu.RUnlock()
 	if receiveTOS {
 		cm.HasTOS = true
 		cm.TOS = p.tos
 	}
+
+	if receiveIPPacketInfo {
+		cm.HasIPPacketInfo = true
+		cm.PacketInfo = p.packetInfo
+	}
 	return p.data.ToView(), cm, nil
 }
 
@@ -495,6 +505,13 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		}
 
 		e.v6only = v
+		return nil
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.Lock()
+		e.receiveIPPacketInfo = v
+		e.mu.Unlock()
+		return nil
 	}
 
 	return nil
@@ -703,6 +720,12 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		e.mu.RUnlock()
 
 		return v, nil
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.RLock()
+		v := e.receiveIPPacketInfo
+		e.mu.RUnlock()
+		return v, nil
 	}
 
 	return false, tcpip.ErrUnknownProtocolOption
@@ -1247,6 +1270,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	switch r.NetProto {
 	case header.IPv4ProtocolNumber:
 		packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
+		packet.packetInfo.LocalAddr = r.LocalAddress
+		packet.packetInfo.DestinationAddr = r.RemoteAddress
+		packet.packetInfo.NIC = r.NICID()
 	}
 
 	packet.timestamp = e.stack.NowNanoseconds()
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 53290bed7..db5663ecd 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -357,5 +357,49 @@ TEST_P(UDPSocketPairTest, SetReuseAddrReusePort) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
+// Test getsockopt on a socket that has never had the IP_PKTINFO option set.
+TEST_P(UDPSocketPairTest, IPPKTINFODefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  // Nothing has set the option on this socket, so it should read back as off.
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), SOL_IP, IP_PKTINFO, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that IP_PKTINFO can be toggled with setsockopt and that getsockopt
+TEST_P(UDPSocketPairTest, SetAndGetIPPKTINFO) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int level = SOL_IP;
+  int type = IP_PKTINFO;
+
+  // Output value and length shared by both getsockopt calls below.
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  // Enable the option and confirm it reads back as on.
+  ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get, kSockOptOn);
+  EXPECT_EQ(get_len, sizeof(get));
+  // Disable it again and confirm it reads back as off.
+  ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOff,
+                         sizeof(kSockOptOff)),
+              SyscallSucceedsWithValue(0));
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len),
+              SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get, kSockOptOff);
+  EXPECT_EQ(get_len, sizeof(get));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index 990ccf23c..bc4b07a62 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -15,6 +15,7 @@
 #include "test/syscalls/linux/socket_ipv4_udp_unbound.h"
 
 #include <arpa/inet.h>
+#include <net/if.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -2128,5 +2129,88 @@ TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
               SyscallSucceedsWithValue(kMessageSize));
 }
 
+// Test that socket will receive packet info control message.
+TEST_P(IPv4UDPUnboundSocketTest, SetAndReceiveIPPKTINFO) {
+  // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+  SKIP_IF((IsRunningWithHostinet()));
+
+  auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  auto sender_addr = V4Loopback();
+  int level = SOL_IP;
+  int type = IP_PKTINFO;
+
+  ASSERT_THAT(
+      bind(receiver->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+           sender_addr.addr_len),
+      SyscallSucceeds());
+  socklen_t sender_addr_len = sender_addr.addr_len;
+  ASSERT_THAT(getsockname(receiver->get(),
+                          reinterpret_cast<sockaddr*>(&sender_addr.addr),
+                          &sender_addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(sender_addr_len, sender_addr.addr_len);
+
+  auto receiver_addr = V4Loopback();
+  reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port =
+      reinterpret_cast<sockaddr_in*>(&sender_addr.addr)->sin_port;
+  ASSERT_THAT(
+      connect(sender->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+              receiver_addr.addr_len),
+      SyscallSucceeds());
+
+  // Allow socket to receive control message.
+  ASSERT_THAT(
+      setsockopt(receiver->get(), level, type, &kSockOptOn, sizeof(kSockOptOn)),
+      SyscallSucceeds());
+
+  // Prepare message to send.
+  constexpr size_t kDataLength = 1024;
+  msghdr sent_msg = {};
+  iovec sent_iov = {};
+  char sent_data[kDataLength];
+  sent_iov.iov_base = sent_data;
+  sent_iov.iov_len = kDataLength;
+  sent_msg.msg_iov = &sent_iov;
+  sent_msg.msg_iovlen = 1;
+  sent_msg.msg_flags = 0;
+
+  ASSERT_THAT(RetryEINTR(sendmsg)(sender->get(), &sent_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  msghdr received_msg = {};
+  iovec received_iov = {};
+  char received_data[kDataLength];
+  char received_cmsg_buf[CMSG_SPACE(sizeof(in_pktinfo))] = {};
+  size_t cmsg_data_len = sizeof(in_pktinfo);
+  received_iov.iov_base = received_data;
+  received_iov.iov_len = kDataLength;
+  received_msg.msg_iov = &received_iov;
+  received_msg.msg_iovlen = 1;
+  received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+  received_msg.msg_control = received_cmsg_buf;
+
+  ASSERT_THAT(RetryEINTR(recvmsg)(receiver->get(), &received_msg, 0),
+              SyscallSucceedsWithValue(kDataLength));
+
+  cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+  ASSERT_NE(cmsg, nullptr);
+  EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+  EXPECT_EQ(cmsg->cmsg_level, level);
+  EXPECT_EQ(cmsg->cmsg_type, type);
+
+  // Get loopback index.
+  ifreq ifr = {};
+  absl::SNPrintF(ifr.ifr_name, IFNAMSIZ, "lo");
+  ASSERT_THAT(ioctl(sender->get(), SIOCGIFINDEX, &ifr), SyscallSucceeds());
+  ASSERT_NE(ifr.ifr_ifindex, 0);
+
+  // Check the data
+  in_pktinfo received_pktinfo = {};
+  memcpy(&received_pktinfo, CMSG_DATA(cmsg), sizeof(in_pktinfo));
+  EXPECT_EQ(received_pktinfo.ipi_ifindex, ifr.ifr_ifindex);
+  EXPECT_EQ(received_pktinfo.ipi_spec_dst.s_addr, htonl(INADDR_LOOPBACK));
+  EXPECT_EQ(received_pktinfo.ipi_addr.s_addr, htonl(INADDR_LOOPBACK));
+}
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index a2f6ef8cc..9f8de6b48 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1495,6 +1495,5 @@ TEST_P(UdpSocketTest, SendAndReceiveTOS) {
   memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
   EXPECT_EQ(received_tos, sent_tos);
 }
-
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From c841373013ec8659b2954563796479f275b00bfa Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 12:00:58 -0800
Subject: Deflake fallocate syscall test.

- Retry if fallocate returns EINTR.

- If fallocate fails, don't try to fstat and confirm the result.

PiperOrigin-RevId: 295789790
---
 test/syscalls/linux/fallocate.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc
index 1c3d00287..7819f4ac3 100644
--- a/test/syscalls/linux/fallocate.cc
+++ b/test/syscalls/linux/fallocate.cc
@@ -33,7 +33,7 @@ namespace testing {
 namespace {
 
 int fallocate(int fd, int mode, off_t offset, off_t len) {
-  return syscall(__NR_fallocate, fd, mode, offset, len);
+  return RetryEINTR(syscall)(__NR_fallocate, fd, mode, offset, len);
 }
 
 class AllocateTest : public FileTest {
@@ -47,27 +47,27 @@ TEST_F(AllocateTest, Fallocate) {
   EXPECT_EQ(buf.st_size, 0);
 
   // Grow to ten bytes.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 10), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 10), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 10);
 
   // Allocate to a smaller size should be noop.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 5), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 5), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 10);
 
   // Grow again.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 20), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 20), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 20);
 
   // Grow with offset.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 10, 20), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 10, 20), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 30);
 
   // Grow with offset beyond EOF.
-  EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 39, 1), SyscallSucceeds());
+  ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 39, 1), SyscallSucceeds());
   ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
   EXPECT_EQ(buf.st_size, 40);
 }
-- 
cgit v1.2.3


From 56fd9504aab44a738d3df164cbee8e572b309f28 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Tue, 18 Feb 2020 15:44:22 -0800
Subject: Enable IPV6_RECVTCLASS socket option for datagram sockets

Added the ability to get/set the IPV6_RECVTCLASS socket option on UDP endpoints.
If enabled, the traffic class from the incoming network header is passed as
ancillary data in the ControlMessages.

Adding Get/SetSockOptBool to decrease the overhead of getting/setting simple
options. (This was absorbed in a CL that will be landing before this one).

Test:
* Added a unit test to udp_test.go that tests getting/setting the option and
verifies that we receive the expected TOS/TClass from an incoming packet.
* Added a syscall test for verifying getting/setting.
* Removed the test skip for an existing syscall test to enable the end-to-end test.
PiperOrigin-RevId: 295840218
---
 pkg/sentry/socket/control/control.go         |   2 +-
 pkg/sentry/socket/netstack/netstack.go       |  27 +++++-
 pkg/tcpip/checker/checker.go                 |  14 +++
 pkg/tcpip/tcpip.go                           |  15 ++-
 pkg/tcpip/transport/udp/endpoint.go          |  38 +++++++-
 pkg/tcpip/transport/udp/udp_test.go          | 120 ++++++++++++++----------
 test/syscalls/linux/ip_socket_test_util.h    |  16 ++--
 test/syscalls/linux/socket_ip_udp_generic.cc | 133 +++++++++++++++++++--------
 test/syscalls/linux/udp_socket_test_cases.cc |   4 -
 9 files changed, 260 insertions(+), 109 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 4667373d2..8834a1e1a 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -329,7 +329,7 @@ func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
 }
 
 // PackTClass packs an IPV6_TCLASS socket control message.
-func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
+func PackTClass(t *kernel.Task, tClass uint32, buf []byte) []byte {
 	return putCmsgStruct(
 		buf,
 		linux.SOL_IPV6,
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 9757fbfba..e187276c5 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1318,6 +1318,22 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		}
 		return ib, nil
 
+	case linux.IPV6_RECVTCLASS:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveTClassOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		var o int32
+		if v {
+			o = 1
+		}
+		return o, nil
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
@@ -1803,6 +1819,14 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv6TrafficClassOption(v)))
 
+	case linux.IPV6_RECVTCLASS:
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0))
+
 	default:
 		emitUnimplementedEventIPv6(t, name)
 	}
@@ -2086,7 +2110,6 @@ func emitUnimplementedEventIPv6(t *kernel.Task, name int) {
 		linux.IPV6_RECVPATHMTU,
 		linux.IPV6_RECVPKTINFO,
 		linux.IPV6_RECVRTHDR,
-		linux.IPV6_RECVTCLASS,
 		linux.IPV6_RTHDR,
 		linux.IPV6_RTHDRDSTOPTS,
 		linux.IPV6_TCLASS,
@@ -2424,6 +2447,8 @@ func (s *SocketOperations) controlMessages() socket.ControlMessages {
 			Timestamp:       s.readCM.Timestamp,
 			HasTOS:          s.readCM.HasTOS,
 			TOS:             s.readCM.TOS,
+			HasTClass:       s.readCM.HasTClass,
+			TClass:          s.readCM.TClass,
 			HasIPPacketInfo: s.readCM.HasIPPacketInfo,
 			PacketInfo:      s.readCM.PacketInfo,
 		},
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 4d6ae0871..c6c160dfc 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -161,6 +161,20 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	}
 }
 
+// ReceiveTClass creates a checker that checks the TCLASS field in
+// ControlMessages.
+func ReceiveTClass(want uint32) ControlMessagesChecker {
+	return func(t *testing.T, cm tcpip.ControlMessages) {
+		t.Helper()
+		if !cm.HasTClass {
+			t.Fatalf("got cm.HasTClass = %t, want cm.TClass = %d", cm.HasTClass, want)
+		}
+		if got := cm.TClass; got != want {
+			t.Fatalf("got cm.TClass = %d, want %d", got, want)
+		}
+	}
+}
+
 // ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
 func ReceiveTOS(want uint8) ControlMessagesChecker {
 	return func(t *testing.T, cm tcpip.ControlMessages) {
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 9ca39ce40..ce5527391 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -323,11 +323,11 @@ type ControlMessages struct {
 	// TOS is the IPv4 type of service of the associated packet.
 	TOS uint8
 
-	// HasTClass indicates whether Tclass is valid/set.
+	// HasTClass indicates whether TClass is valid/set.
 	HasTClass bool
 
-	// Tclass is the IPv6 traffic class of the associated packet.
-	TClass int32
+	// TClass is the IPv6 traffic class of the associated packet.
+	TClass uint32
 
 	// HasIPPacketInfo indicates whether PacketInfo is set.
 	HasIPPacketInfo bool
@@ -502,9 +502,13 @@ type WriteOptions struct {
 type SockOptBool int
 
 const (
+	// ReceiveTClassOption is used by {G,S}etSockOptBool to specify if the
+	// IPV6_TCLASS ancillary message is passed with incoming packets.
+	ReceiveTClassOption SockOptBool = iota
+
 	// ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
 	// ancillary message is passed with incoming packets.
-	ReceiveTOSOption SockOptBool = iota
+	ReceiveTOSOption
 
 	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
 	// socket is to be restricted to sending and receiving IPv6 packets only.
@@ -514,6 +518,9 @@ const (
 	// if more inforamtion is provided with incoming packets such
 	// as interface index and address.
 	ReceiveIPPacketInfoOption
+
+	// TODO(b/146901447): convert existing bool socket options to be handled via
+	// Get/SetSockOptBool
 )
 
 // SockOptInt represents socket options which values have the int type.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 3fe91cac2..eff7f3600 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -32,7 +32,8 @@ type udpPacket struct {
 	packetInfo    tcpip.IPPacketInfo
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
-	tos           uint8
+	// tos stores either the receiveTOS or receiveTClass value.
+	tos uint8
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -119,6 +120,10 @@ type endpoint struct {
 	// as ancillary data to ControlMessages on Read.
 	receiveTOS bool
 
+	// receiveTClass determines if the incoming IPv6 TClass header field is
+	// passed as ancillary data to ControlMessages on Read.
+	receiveTClass bool
+
 	// receiveIPPacketInfo determines if the packet info is returned by Read.
 	receiveIPPacketInfo bool
 
@@ -258,13 +263,18 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 	}
 	e.mu.RLock()
 	receiveTOS := e.receiveTOS
+	receiveTClass := e.receiveTClass
 	receiveIPPacketInfo := e.receiveIPPacketInfo
 	e.mu.RUnlock()
 	if receiveTOS {
 		cm.HasTOS = true
 		cm.TOS = p.tos
 	}
-
+	if receiveTClass {
+		cm.HasTClass = true
+		// Although TClass is an 8-bit value it's read in the CMsg as a uint32.
+		cm.TClass = uint32(p.tos)
+	}
 	if receiveIPPacketInfo {
 		cm.HasIPPacketInfo = true
 		cm.PacketInfo = p.packetInfo
@@ -490,6 +500,17 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		e.mu.Unlock()
 		return nil
 
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return tcpip.ErrNotSupported
+		}
+
+		e.mu.Lock()
+		e.receiveTClass = v
+		e.mu.Unlock()
+		return nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -709,6 +730,17 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		e.mu.RUnlock()
 		return v, nil
 
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return false, tcpip.ErrNotSupported
+		}
+
+		e.mu.RLock()
+		v := e.receiveTClass
+		e.mu.RUnlock()
+		return v, nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -1273,6 +1305,8 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 		packet.packetInfo.LocalAddr = r.LocalAddress
 		packet.packetInfo.DestinationAddr = r.RemoteAddress
 		packet.packetInfo.NIC = r.NICID()
+	case header.IPv6ProtocolNumber:
+		packet.tos, _ = header.IPv6(pkt.NetworkHeader).TOS()
 	}
 
 	packet.timestamp = e.stack.NowNanoseconds()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index f0ff3fe71..34b7c2360 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -409,6 +409,7 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 	// Initialize the IP header.
 	ip := header.IPv6(buf)
 	ip.Encode(&header.IPv6Fields{
+		TrafficClass:  testTOS,
 		PayloadLength: uint16(header.UDPMinimumSize + len(payload)),
 		NextHeader:    uint8(udp.ProtocolNumber),
 		HopLimit:      65,
@@ -1336,7 +1337,7 @@ func TestSetTTL(t *testing.T) {
 	}
 }
 
-func TestTOSV4(t *testing.T) {
+func TestSetTOS(t *testing.T) {
 	for _, flow := range []testFlow{unicastV4, multicastV4, broadcast} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
 			c := newDualTestContext(t, defaultMTU)
@@ -1347,23 +1348,23 @@ func TestTOSV4(t *testing.T) {
 			const tos = testTOS
 			var v tcpip.IPv4TOSOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, 0)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, 0)
 			}
 
 			if err := c.ep.SetSockOpt(tcpip.IPv4TOSOption(tos)); err != nil {
-				c.t.Errorf("SetSockOpt(%#v) failed: %s", tcpip.IPv4TOSOption(tos), err)
+				c.t.Errorf("SetSockOpt(%T, 0x%x) failed: %s", v, tcpip.IPv4TOSOption(tos), err)
 			}
 
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 
 			if want := tcpip.IPv4TOSOption(tos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, want)
 			}
 
 			testWrite(c, flow, checker.TOS(tos, 0))
@@ -1371,7 +1372,7 @@ func TestTOSV4(t *testing.T) {
 	}
 }
 
-func TestTOSV6(t *testing.T) {
+func TestSetTClass(t *testing.T) {
 	for _, flow := range []testFlow{unicastV4in6, unicastV6, unicastV6Only, multicastV4in6, multicastV6, broadcastIn6} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
 			c := newDualTestContext(t, defaultMTU)
@@ -1379,71 +1380,92 @@ func TestTOSV6(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = testTOS
+			const tClass = testTOS
 			var v tcpip.IPv6TrafficClassOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 			// Test for expected default value.
 			if v != 0 {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, 0)
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, 0)
 			}
 
-			if err := c.ep.SetSockOpt(tcpip.IPv6TrafficClassOption(tos)); err != nil {
-				c.t.Errorf("SetSockOpt failed: %s", err)
+			if err := c.ep.SetSockOpt(tcpip.IPv6TrafficClassOption(tClass)); err != nil {
+				c.t.Errorf("SetSockOpt(%T, 0x%x) failed: %s", v, tcpip.IPv6TrafficClassOption(tClass), err)
 			}
 
 			if err := c.ep.GetSockOpt(&v); err != nil {
-				c.t.Errorf("GetSockopt failed: %s", err)
+				c.t.Errorf("GetSockopt(%T) failed: %s", v, err)
 			}
 
-			if want := tcpip.IPv6TrafficClassOption(tos); v != want {
-				c.t.Errorf("got GetSockOpt(...) = %#v, want = %#v", v, want)
+			if want := tcpip.IPv6TrafficClassOption(tClass); v != want {
+				c.t.Errorf("got GetSockOpt(%T) = 0x%x, want = 0x%x", v, v, want)
 			}
 
-			testWrite(c, flow, checker.TOS(tos, 0))
+			// The header getter for TClass is called TOS, so use that checker.
+			testWrite(c, flow, checker.TOS(tClass, 0))
 		})
 	}
 }
 
-func TestReceiveTOSV4(t *testing.T) {
-	for _, flow := range []testFlow{unicastV4, broadcast} {
-		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
-			c := newDualTestContext(t, defaultMTU)
-			defer c.cleanup()
+func TestReceiveTosTClass(t *testing.T) {
+	testCases := []struct {
+		name             string
+		getReceiveOption tcpip.SockOptBool
+		tests            []testFlow
+	}{
+		{"ReceiveTosOption", tcpip.ReceiveTOSOption, []testFlow{unicastV4, broadcast}},
+		{"ReceiveTClassOption", tcpip.ReceiveTClassOption, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}},
+	}
+	for _, testCase := range testCases {
+		for _, flow := range testCase.tests {
+			t.Run(fmt.Sprintf("%s:flow:%s", testCase.name, flow), func(t *testing.T) {
+				c := newDualTestContext(t, defaultMTU)
+				defer c.cleanup()
 
-			c.createEndpointForFlow(flow)
+				c.createEndpointForFlow(flow)
+				option := testCase.getReceiveOption
+				name := testCase.name
 
-			// Verify that setting and reading the option works.
-			v, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
-			if err != nil {
-				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
-			}
-			// Test for expected default value.
-			if v != false {
-				c.t.Errorf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", v, false)
-			}
+				// Verify that setting and reading the option works.
+				v, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockoptBool(%s) failed: %s", name, err)
+				}
+				// Test for expected default value.
+				if v != false {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, v, false)
+				}
 
-			want := true
-			if err := c.ep.SetSockOptBool(tcpip.ReceiveTOSOption, want); err != nil {
-				c.t.Fatalf("SetSockOptBool(tcpip.ReceiveTOSOption, %t) failed: %s", want, err)
-			}
+				want := true
+				if err := c.ep.SetSockOptBool(option, want); err != nil {
+					c.t.Fatalf("SetSockOptBool(%s, %t) failed: %s", name, want, err)
+				}
 
-			got, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
-			if err != nil {
-				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
-			}
-			if got != want {
-				c.t.Fatalf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", got, want)
-			}
+				got, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockoptBool(%s) failed: %s", name, err)
+				}
 
-			// Verify that the correct received TOS is handed through as
-			// ancillary data to the ControlMessages struct.
-			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-				c.t.Fatal("Bind failed:", err)
-			}
-			testRead(c, flow, checker.ReceiveTOS(testTOS))
-		})
+				if got != want {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, got, want)
+				}
+
+				// Verify that the correct received TOS or TClass is handed through as
+				// ancillary data to the ControlMessages struct.
+				if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+					c.t.Fatalf("Bind failed: %s", err)
+				}
+				switch option {
+				case tcpip.ReceiveTClassOption:
+					testRead(c, flow, checker.ReceiveTClass(testTOS))
+				case tcpip.ReceiveTOSOption:
+					testRead(c, flow, checker.ReceiveTOS(testTOS))
+				default:
+					t.Fatalf("unknown test variant: %s", name)
+				}
+			})
+		}
 	}
 }
 
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 083ebbcf0..39fd6709d 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -84,20 +84,20 @@ SocketPairKind DualStackUDPBidirectionalBindSocketPair(int type);
 // SocketPairs created with AF_INET and the given type.
 SocketPairKind IPv4UDPUnboundSocketPair(int type);
 
-// IPv4UDPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET, SOCK_DGRAM, and the given type.
+// IPv4UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_DGRAM, and the given type.
 SocketKind IPv4UDPUnboundSocket(int type);
 
-// IPv6UDPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET6, SOCK_DGRAM, and the given type.
+// IPv6UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_DGRAM, and the given type.
 SocketKind IPv6UDPUnboundSocket(int type);
 
-// IPv4TCPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET, SOCK_STREAM and the given type.
+// IPv4TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_STREAM and the given type.
 SocketKind IPv4TCPUnboundSocket(int type);
 
-// IPv6TCPUnboundSocketPair returns a SocketKind that represents
-// a SimpleSocket created with AF_INET6, SOCK_STREAM and the given type.
+// IPv6TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_STREAM and the given type.
 SocketKind IPv6TCPUnboundSocket(int type);
 
 // IfAddrHelper is a helper class that determines the local interfaces present
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index db5663ecd..1c533fdf2 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -14,6 +14,7 @@
 
 #include "test/syscalls/linux/socket_ip_udp_generic.h"
 
+#include <errno.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <poll.h>
@@ -209,46 +210,6 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
-// Ensure that Receiving TOS is off by default.
-TEST_P(UDPSocketPairTest, RecvTosDefault) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-}
-
-// Test that setting and getting IP_RECVTOS works as expected.
-TEST_P(UDPSocketPairTest, SetRecvTos) {
-  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOff, sizeof(kSockOptOff)),
-              SyscallSucceeds());
-
-  int get = -1;
-  socklen_t get_len = sizeof(get);
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOff);
-
-  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
-                         &kSockOptOn, sizeof(kSockOptOn)),
-              SyscallSucceeds());
-
-  ASSERT_THAT(
-      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
-      SyscallSucceedsWithValue(0));
-  EXPECT_EQ(get_len, sizeof(get));
-  EXPECT_EQ(get, kSockOptOn);
-}
-
 TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -401,5 +362,97 @@ TEST_P(UDPSocketPairTest, SetAndGetIPPKTINFO) {
   EXPECT_EQ(get_len, sizeof(get));
 }
 
+// Holds TOS or TClass information for IPv4 or IPv6 respectively.
+struct RecvTosOption {
+  int level;
+  int option;
+};
+
+RecvTosOption GetRecvTosOption(int domain) {
+  TEST_CHECK(domain == AF_INET || domain == AF_INET6);
+  RecvTosOption opt;
+  switch (domain) {
+    case AF_INET:
+      opt.level = IPPROTO_IP;
+      opt.option = IP_RECVTOS;
+      break;
+    case AF_INET6:
+      opt.level = IPPROTO_IPV6;
+      opt.option = IPV6_RECVTCLASS;
+      break;
+  }
+  return opt;
+}
+
+// Ensure that Receiving TOS or TCLASS is off by default.
+TEST_P(UDPSocketPairTest, RecvTosDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(GetParam().domain);
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that setting and getting IP_RECVTOS or IPV6_RECVTCLASS works as
+// expected.
+TEST_P(UDPSocketPairTest, SetRecvTos) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(GetParam().domain);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOff,
+                         sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+}
+
+// Test that any socket (including IPv6 only) accepts the IPv4 TOS option: this
+// mirrors behavior in Linux.
+TEST_P(UDPSocketPairTest, TOSRecvMismatch) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+  RecvTosOption t = GetRecvTosOption(AF_INET);
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+}
+
+// Test that an IPv4 socket does not support the IPv6 TClass option.
+TEST_P(UDPSocketPairTest, TClassRecvMismatch) {
+  // This should only test AF_INET sockets for the mismatch behavior.
+  SKIP_IF(GetParam().domain != AF_INET);
+
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+
+  ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IPV6, IPV6_RECVTCLASS,
+                         &get, &get_len),
+              SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 9f8de6b48..57b1a357c 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,9 +1349,6 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
-  SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
-          !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1422,7 +1419,6 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/144868438): IPV6_RECVTCLASS not supported for netstack.
   // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From 55c99ce106e03c419729318947e0be477ed181d0 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Wed, 19 Feb 2020 12:31:43 -0800
Subject: Include more test files in exports_files

So that they can be included by Fuchsia's syscall tests.

PiperOrigin-RevId: 296030383
---
 test/syscalls/linux/BUILD | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index e7c82adfc..05a818795 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -12,8 +12,12 @@ exports_files(
         "socket_ip_loopback_blocking.cc",
         "socket_ip_tcp_generic_loopback.cc",
         "socket_ip_tcp_loopback.cc",
+        "socket_ip_tcp_loopback_blocking.cc",
+        "socket_ip_tcp_loopback_nonblock.cc",
         "socket_ip_tcp_udp_generic.cc",
         "socket_ip_udp_loopback.cc",
+        "socket_ip_udp_loopback_blocking.cc",
+        "socket_ip_udp_loopback_nonblock.cc",
         "socket_ip_unbound.cc",
         "socket_ipv4_tcp_unbound_external_networking_test.cc",
         "socket_ipv4_udp_unbound_external_networking_test.cc",
-- 
cgit v1.2.3


From 30794512d3977ebb2b185e5e9cfb969d558a07a4 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 19 Feb 2020 18:20:52 -0800
Subject: Add basic microbenchmarks.

PiperOrigin-RevId: 296104390
---
 WORKSPACE                                  |  10 +
 test/perf/BUILD                            | 114 +++++++
 test/perf/linux/BUILD                      | 356 +++++++++++++++++++++
 test/perf/linux/clock_getres_benchmark.cc  |  39 +++
 test/perf/linux/clock_gettime_benchmark.cc |  60 ++++
 test/perf/linux/death_benchmark.cc         |  36 +++
 test/perf/linux/epoll_benchmark.cc         |  99 ++++++
 test/perf/linux/fork_benchmark.cc          | 350 +++++++++++++++++++++
 test/perf/linux/futex_benchmark.cc         | 248 +++++++++++++++
 test/perf/linux/getdents_benchmark.cc      | 149 +++++++++
 test/perf/linux/getpid_benchmark.cc        |  37 +++
 test/perf/linux/gettid_benchmark.cc        |  38 +++
 test/perf/linux/mapping_benchmark.cc       | 163 ++++++++++
 test/perf/linux/open_benchmark.cc          |  56 ++++
 test/perf/linux/pipe_benchmark.cc          |  66 ++++
 test/perf/linux/randread_benchmark.cc      | 100 ++++++
 test/perf/linux/read_benchmark.cc          |  53 ++++
 test/perf/linux/sched_yield_benchmark.cc   |  37 +++
 test/perf/linux/send_recv_benchmark.cc     | 372 ++++++++++++++++++++++
 test/perf/linux/seqwrite_benchmark.cc      |  66 ++++
 test/perf/linux/signal_benchmark.cc        |  59 ++++
 test/perf/linux/sleep_benchmark.cc         |  60 ++++
 test/perf/linux/stat_benchmark.cc          |  62 ++++
 test/perf/linux/unlink_benchmark.cc        |  66 ++++
 test/perf/linux/write_benchmark.cc         |  52 ++++
 test/runner/BUILD                          |  22 ++
 test/runner/defs.bzl                       | 218 +++++++++++++
 test/runner/gtest/BUILD                    |   9 +
 test/runner/gtest/gtest.go                 | 154 +++++++++
 test/runner/runner.go                      | 477 ++++++++++++++++++++++++++++
 test/syscalls/BUILD                        |  21 +-
 test/syscalls/build_defs.bzl               | 180 -----------
 test/syscalls/gtest/BUILD                  |   9 -
 test/syscalls/gtest/gtest.go               |  93 ------
 test/syscalls/linux/alarm.cc               |   3 +-
 test/syscalls/linux/exec.cc                |   3 +-
 test/syscalls/linux/fcntl.cc               |   2 +-
 test/syscalls/linux/itimer.cc              |   3 +-
 test/syscalls/linux/prctl.cc               |   2 +-
 test/syscalls/linux/prctl_setuid.cc        |   2 +-
 test/syscalls/linux/proc.cc                |   2 +-
 test/syscalls/linux/ptrace.cc              |   2 +-
 test/syscalls/linux/rtsignal.cc            |   3 +-
 test/syscalls/linux/seccomp.cc             |   2 +-
 test/syscalls/linux/sigiret.cc             |   3 +-
 test/syscalls/linux/signalfd.cc            |   2 +-
 test/syscalls/linux/sigstop.cc             |   2 +-
 test/syscalls/linux/sigtimedwait.cc        |   3 +-
 test/syscalls/linux/timers.cc              |   2 +-
 test/syscalls/linux/vfork.cc               |   2 +-
 test/syscalls/syscall_test_runner.go       | 482 -----------------------------
 test/syscalls/syscall_test_runner.sh       |  34 --
 test/util/BUILD                            |   3 +-
 test/util/test_main.cc                     |   2 +-
 test/util/test_util.h                      |   1 +
 test/util/test_util_impl.cc                |  14 +
 tools/bazeldefs/defs.bzl                   |   1 +
 tools/defs.bzl                             |   3 +-
 58 files changed, 3666 insertions(+), 843 deletions(-)
 create mode 100644 test/perf/BUILD
 create mode 100644 test/perf/linux/BUILD
 create mode 100644 test/perf/linux/clock_getres_benchmark.cc
 create mode 100644 test/perf/linux/clock_gettime_benchmark.cc
 create mode 100644 test/perf/linux/death_benchmark.cc
 create mode 100644 test/perf/linux/epoll_benchmark.cc
 create mode 100644 test/perf/linux/fork_benchmark.cc
 create mode 100644 test/perf/linux/futex_benchmark.cc
 create mode 100644 test/perf/linux/getdents_benchmark.cc
 create mode 100644 test/perf/linux/getpid_benchmark.cc
 create mode 100644 test/perf/linux/gettid_benchmark.cc
 create mode 100644 test/perf/linux/mapping_benchmark.cc
 create mode 100644 test/perf/linux/open_benchmark.cc
 create mode 100644 test/perf/linux/pipe_benchmark.cc
 create mode 100644 test/perf/linux/randread_benchmark.cc
 create mode 100644 test/perf/linux/read_benchmark.cc
 create mode 100644 test/perf/linux/sched_yield_benchmark.cc
 create mode 100644 test/perf/linux/send_recv_benchmark.cc
 create mode 100644 test/perf/linux/seqwrite_benchmark.cc
 create mode 100644 test/perf/linux/signal_benchmark.cc
 create mode 100644 test/perf/linux/sleep_benchmark.cc
 create mode 100644 test/perf/linux/stat_benchmark.cc
 create mode 100644 test/perf/linux/unlink_benchmark.cc
 create mode 100644 test/perf/linux/write_benchmark.cc
 create mode 100644 test/runner/BUILD
 create mode 100644 test/runner/defs.bzl
 create mode 100644 test/runner/gtest/BUILD
 create mode 100644 test/runner/gtest/gtest.go
 create mode 100644 test/runner/runner.go
 delete mode 100644 test/syscalls/build_defs.bzl
 delete mode 100644 test/syscalls/gtest/BUILD
 delete mode 100644 test/syscalls/gtest/gtest.go
 delete mode 100644 test/syscalls/syscall_test_runner.go
 delete mode 100755 test/syscalls/syscall_test_runner.sh

(limited to 'test/syscalls/linux')

diff --git a/WORKSPACE b/WORKSPACE
index 2827c3a26..ff0196dc6 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -330,3 +330,13 @@ http_archive(
         "https://github.com/google/googletest/archive/565f1b848215b77c3732bca345fe76a0431d8b34.tar.gz",
     ],
 )
+
+http_archive(
+    name = "com_google_benchmark",
+    sha256 = "3c6a165b6ecc948967a1ead710d4a181d7b0fbcaa183ef7ea84604994966221a",
+    strip_prefix = "benchmark-1.5.0",
+    urls = [
+        "https://mirror.bazel.build/github.com/google/benchmark/archive/v1.5.0.tar.gz",
+        "https://github.com/google/benchmark/archive/v1.5.0.tar.gz",
+    ],
+)
diff --git a/test/perf/BUILD b/test/perf/BUILD
new file mode 100644
index 000000000..7a2bf10ed
--- /dev/null
+++ b/test/perf/BUILD
@@ -0,0 +1,114 @@
+load("//test/runner:defs.bzl", "syscall_test")
+
+package(licenses = ["notice"])
+
+syscall_test(
+    test = "//test/perf/linux:clock_getres_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:clock_gettime_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:death_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:epoll_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:fork_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:futex_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:getdents_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:getpid_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:gettid_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:mapping_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:open_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:pipe_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:randread_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:read_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:sched_yield_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:send_recv_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:seqwrite_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    test = "//test/perf/linux:signal_benchmark",
+)
+
+syscall_test(
+    test = "//test/perf/linux:sleep_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:stat_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:unlink_benchmark",
+)
+
+syscall_test(
+    size = "large",
+    add_overlay = True,
+    test = "//test/perf/linux:write_benchmark",
+)
diff --git a/test/perf/linux/BUILD b/test/perf/linux/BUILD
new file mode 100644
index 000000000..b4e907826
--- /dev/null
+++ b/test/perf/linux/BUILD
@@ -0,0 +1,356 @@
+load("//tools:defs.bzl", "cc_binary", "gbenchmark", "gtest")
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+cc_binary(
+    name = "getpid_benchmark",
+    testonly = 1,
+    srcs = [
+        "getpid_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "send_recv_benchmark",
+    testonly = 1,
+    srcs = [
+        "send_recv_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/syscalls/linux:socket_test_util",
+        "//test/util:file_descriptor",
+        "//test/util:logging",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_binary(
+    name = "gettid_benchmark",
+    testonly = 1,
+    srcs = [
+        "gettid_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "sched_yield_benchmark",
+    testonly = 1,
+    srcs = [
+        "sched_yield_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "clock_getres_benchmark",
+    testonly = 1,
+    srcs = [
+        "clock_getres_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "clock_gettime_benchmark",
+    testonly = 1,
+    srcs = [
+        "clock_gettime_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:test_main",
+        "@com_google_absl//absl/time",
+    ],
+)
+
+cc_binary(
+    name = "open_benchmark",
+    testonly = 1,
+    srcs = [
+        "open_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "read_benchmark",
+    testonly = 1,
+    srcs = [
+        "read_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "randread_benchmark",
+    testonly = 1,
+    srcs = [
+        "randread_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:file_descriptor",
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/random",
+    ],
+)
+
+cc_binary(
+    name = "write_benchmark",
+    testonly = 1,
+    srcs = [
+        "write_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "seqwrite_benchmark",
+    testonly = 1,
+    srcs = [
+        "seqwrite_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/random",
+    ],
+)
+
+cc_binary(
+    name = "pipe_benchmark",
+    testonly = 1,
+    srcs = [
+        "pipe_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+    ],
+)
+
+cc_binary(
+    name = "fork_benchmark",
+    testonly = 1,
+    srcs = [
+        "fork_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:cleanup",
+        "//test/util:file_descriptor",
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_binary(
+    name = "futex_benchmark",
+    testonly = 1,
+    srcs = [
+        "futex_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/time",
+    ],
+)
+
+cc_binary(
+    name = "epoll_benchmark",
+    testonly = 1,
+    srcs = [
+        "epoll_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:epoll_util",
+        "//test/util:file_descriptor",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/time",
+    ],
+)
+
+cc_binary(
+    name = "death_benchmark",
+    testonly = 1,
+    srcs = [
+        "death_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "mapping_benchmark",
+    testonly = 1,
+    srcs = [
+        "mapping_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:memory_util",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "signal_benchmark",
+    testonly = 1,
+    srcs = [
+        "signal_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "getdents_benchmark",
+    testonly = 1,
+    srcs = [
+        "getdents_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
+    name = "sleep_benchmark",
+    testonly = 1,
+    srcs = [
+        "sleep_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:logging",
+        "//test/util:test_main",
+    ],
+)
+
+cc_binary(
+    name = "stat_benchmark",
+    testonly = 1,
+    srcs = [
+        "stat_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_binary(
+    name = "unlink_benchmark",
+    testonly = 1,
+    srcs = [
+        "unlink_benchmark.cc",
+    ],
+    deps = [
+        gbenchmark,
+        gtest,
+        "//test/util:fs_util",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
diff --git a/test/perf/linux/clock_getres_benchmark.cc b/test/perf/linux/clock_getres_benchmark.cc
new file mode 100644
index 000000000..b051293ad
--- /dev/null
+++ b/test/perf/linux/clock_getres_benchmark.cc
@@ -0,0 +1,39 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <time.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Benchmarks clock_getres(2): the call is very nearly a no-op, but it must copy
+// a timespec out to userspace, so it serves as a small copy-out benchmark.
+void BM_ClockGetRes(benchmark::State& state) {
+  struct timespec resolution;
+  for (auto _ : state) {
+    clock_getres(CLOCK_MONOTONIC, &resolution);
+  }
+}
+
+BENCHMARK(BM_ClockGetRes);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/clock_gettime_benchmark.cc b/test/perf/linux/clock_gettime_benchmark.cc
new file mode 100644
index 000000000..6691bebd9
--- /dev/null
+++ b/test/perf/linux/clock_gettime_benchmark.cc
@@ -0,0 +1,60 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <pthread.h>
+#include <time.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void BM_ClockGettimeThreadCPUTime(benchmark::State& state) {
+  // Measure clock_gettime(2) on the calling thread's CPU-time clock.
+  struct timespec now;
+  clockid_t clockid;
+  ASSERT_EQ(0, pthread_getcpuclockid(pthread_self(), &clockid));
+  for (auto _ : state) {
+    clock_gettime(clockid, &now);
+  }
+}
+
+BENCHMARK(BM_ClockGettimeThreadCPUTime);
+
+void BM_VDSOClockGettime(benchmark::State& state) {
+  // The clock under test is supplied as the benchmark argument.
+  const clockid_t clock_id = state.range(0);
+  struct timespec now;
+
+  // Spin for 2.1 seconds first so the calibration phase is not benchmarked.
+  const absl::Time warmup_end = absl::Now() + absl::Milliseconds(2100);
+  while (absl::Now() < warmup_end) {
+    clock_gettime(clock_id, &now);
+  }
+  for (auto _ : state) {
+    clock_gettime(clock_id, &now);
+  }
+}
+
+BENCHMARK(BM_VDSOClockGettime)->Arg(CLOCK_MONOTONIC)->Arg(CLOCK_REALTIME);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/death_benchmark.cc b/test/perf/linux/death_benchmark.cc
new file mode 100644
index 000000000..cb2b6fd07
--- /dev/null
+++ b/test/perf/linux/death_benchmark.cc
@@ -0,0 +1,36 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// DeathTest is a macrobenchmark rather than a microbenchmark: it exercises
+// gVisor's ability (on whatever platform) to run all the stack-dumping routines
+// associated with EXPECT_EXIT / EXPECT_DEATH when a TEST_CHECK fails.
+TEST(DeathTest, ZeroEqualsOne) {
+  EXPECT_EXIT({ TEST_CHECK(0 == 1); }, ::testing::KilledBySignal(SIGABRT), "");
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/epoll_benchmark.cc b/test/perf/linux/epoll_benchmark.cc
new file mode 100644
index 000000000..0b121338a
--- /dev/null
+++ b/test/perf/linux/epoll_benchmark.cc
@@ -0,0 +1,99 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+
+#include <atomic>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <ctime>
+#include <memory>
+
+#include "gtest/gtest.h"
+#include "absl/time/time.h"
+#include "benchmark/benchmark.h"
+#include "test/util/epoll_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Returns a fresh eventfd (initial count 0, no flags), or errno on failure.
+PosixErrorOr<FileDescriptor> NewEventFD() {
+  const int efd = eventfd(/* initval = */ 0, /* flags = */ 0);
+  MaybeSave();
+  if (efd >= 0) {
+    return FileDescriptor(efd);
+  }
+  return PosixError(errno, "eventfd");
+}
+
+// Stolen from the epoll.cc unit tests; times epoll_wait(2) calls that time out.
+void BM_EpollTimeout(benchmark::State& state) {
+  constexpr int kFDsPerEpoll = 3;
+  auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+  // Register eventfds for EPOLLIN; nothing here ever writes to them.
+  std::vector<FileDescriptor> eventfds;
+  for (int i = 0; i < kFDsPerEpoll; i++) {
+    eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+    ASSERT_NO_ERRNO(
+        RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN, 0));
+  }
+
+  struct epoll_event result[kFDsPerEpoll];
+  int timeout_ms = state.range(0);  // Benchmark argument, in milliseconds.
+
+  for (auto _ : state) {
+    EXPECT_EQ(0, epoll_wait(epollfd.get(), result, kFDsPerEpoll, timeout_ms));
+  }
+}
+
+BENCHMARK(BM_EpollTimeout)->Range(0, 8);
+
+// Also stolen from epoll.cc unit tests; times epoll_wait(2) with all FDs ready.
+void BM_EpollAllEvents(benchmark::State& state) {
+  auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+  const int fds_per_epoll = state.range(0);  // Number of eventfds to register.
+  constexpr uint64_t kEventVal = 5;
+
+  std::vector<FileDescriptor> eventfds;
+  for (int i = 0; i < fds_per_epoll; i++) {
+    eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+    ASSERT_NO_ERRNO(
+        RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN, 0));
+    // Write a value so the eventfd is readable; nothing ever reads it back.
+    ASSERT_THAT(WriteFd(eventfds[i].get(), &kEventVal, sizeof(kEventVal)),
+                SyscallSucceedsWithValue(sizeof(kEventVal)));
+  }
+
+  std::vector<struct epoll_event> result(fds_per_epoll);
+  // Zero timeout: each wait is expected to report every registered FD at once.
+  for (auto _ : state) {
+    EXPECT_EQ(fds_per_epoll,
+              epoll_wait(epollfd.get(), result.data(), fds_per_epoll, 0));
+  }
+}
+
+BENCHMARK(BM_EpollAllEvents)->Range(2, 1024);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/fork_benchmark.cc b/test/perf/linux/fork_benchmark.cc
new file mode 100644
index 000000000..84fdbc8a0
--- /dev/null
+++ b/test/perf/linux/fork_benchmark.cc
@@ -0,0 +1,350 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/synchronization/barrier.h"
+#include "benchmark/benchmark.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int kBusyMax = 250;
+
+// Burns CPU: for each n in [1, max), counts the divisors of n in [2, n / 2).
+int busy(int max) {
+  // volatile keeps the compiler from optimizing the counting away.
+  volatile int hits = 0;
+
+  for (int n = 1; n < max; ++n) {
+    const int limit = n / 2;
+    for (int d = 2; d < limit; ++d) {
+      if (n % d != 0) continue;
+      hits++;
+    }
+  }
+
+  return hits;
+}
+
+void BM_CPUBoundUniprocess(benchmark::State& state) {
+  for (auto _ : state) {
+    busy(kBusyMax);  // Single-process baseline for the fork-based variants.
+  }
+}
+
+BENCHMARK(BM_CPUBoundUniprocess);
+
+void BM_CPUBoundAsymmetric(benchmark::State& state) {
+  const size_t max = state.max_iterations;
+  pid_t child = fork();
+  if (child == 0) {
+    // Child: do every iteration's busy-work. The index is size_t to match max.
+    for (size_t i = 0; i < max; i++) busy(kBusyMax);
+    _exit(0);
+  }
+  ASSERT_THAT(child, SyscallSucceeds());
+  // Parent: consume all iterations in a single batch while the child runs.
+  ASSERT_TRUE(state.KeepRunningBatch(max));
+
+  int status;
+  EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+  EXPECT_TRUE(WIFEXITED(status));
+  EXPECT_EQ(0, WEXITSTATUS(status));
+  ASSERT_FALSE(state.KeepRunning());  // No iterations are expected to remain.
+}
+
+BENCHMARK(BM_CPUBoundAsymmetric)->UseRealTime();
+
+void BM_CPUBoundSymmetric(benchmark::State& state) {
+  std::vector<pid_t> children;
+  // Reaps each forked child and checks that it exited with status 0.
+  auto child_cleanup = Cleanup([&] {
+    for (const pid_t child : children) {
+      int status;
+      EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+      EXPECT_TRUE(WIFEXITED(status));
+      EXPECT_EQ(0, WEXITSTATUS(status));
+    }
+    ASSERT_FALSE(state.KeepRunning());
+  });
+
+  const int processes = state.range(0);
+  for (int i = 0; i < processes; i++) {
+    // Give each child an even share, clamped to the iterations still left.
+    size_t cur = (state.max_iterations + (processes - 1)) / processes;
+    if ((state.iterations() + cur) >= state.max_iterations) {
+      cur = state.max_iterations - state.iterations();
+    }
+    pid_t child = fork();
+    if (child == 0) {
+      for (size_t j = 0; j < cur; j++) busy(kBusyMax);
+      _exit(0);
+    }
+    ASSERT_THAT(child, SyscallSucceeds());
+    if (cur > 0) {
+      // cur may be zero once earlier batches have claimed every iteration.
+      ASSERT_TRUE(state.KeepRunningBatch(cur));
+    }
+    children.push_back(child);
+  }
+}
+
+BENCHMARK(BM_CPUBoundSymmetric)->Range(2, 16)->UseRealTime();
+
+// Relay loop for the ProcessSwitch/ThreadSwitch children: copies one byte at a
+// time from readfd to writefd until ReadFd returns 0 or WriteFd hits EPIPE.
+void SwitchChild(int readfd, int writefd) {
+  for (;;) {
+    char byte;
+    int ret = ReadFd(readfd, &byte, 1);
+    if (ret == 0) {
+      return;
+    }
+    TEST_CHECK_MSG(ret == 1, "read failed");
+
+    ret = WriteFd(writefd, &byte, 1);
+    if (ret == -1) {
+      TEST_CHECK_MSG(errno == EPIPE, "unexpected write failure");
+      return;
+    }
+    TEST_CHECK_MSG(ret == 1, "write failed");
+  }
+}
+
+// Send a byte in a loop through a ring of pipes, each hop passing through a
+// different process.
+//
+//  Proc 0        Proc 1
+//    * ----------> *
+//    ^   Pipe 1    |
+//    |             |
+//    | Pipe 0      | Pipe 2
+//    |             |
+//    |             |
+//    |   Pipe 3    v
+//    * <---------- *
+//  Proc 3        Proc 2
+//
+// This exercises context switching through multiple processes.
+void BM_ProcessSwitch(benchmark::State& state) {
+  // Code below assumes there are at least two processes.
+  const int num_processes = state.range(0);
+  ASSERT_GE(num_processes, 2);
+
+  std::vector<pid_t> children;  // Forked children, reaped by child_cleanup.
+  auto child_cleanup = Cleanup([&] {
+    for (const pid_t child : children) {
+      int status;
+      EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+      EXPECT_TRUE(WIFEXITED(status));
+      EXPECT_EQ(0, WEXITSTATUS(status));
+    }
+  });
+
+  // Must be declared after children/child_cleanup: destruction runs in reverse
+  // order, so the FDs close (letting the children exit) before they are reaped.
+  std::vector<FileDescriptor> read_fds;
+  std::vector<FileDescriptor> write_fds;
+
+  for (int i = 0; i < num_processes; i++) {
+    int fds[2];
+    ASSERT_THAT(pipe(fds), SyscallSucceeds());
+    read_fds.emplace_back(fds[0]);
+    write_fds.emplace_back(fds[1]);
+  }
+
+  // This process is one of the processes in the loop. It will be considered
+  // index 0.
+  for (int i = 1; i < num_processes; i++) {
+    // Read from current pipe index, write to next.
+    const int read_index = i;
+    const int read_fd = read_fds[read_index].get();
+
+    const int write_index = (i + 1) % num_processes;
+    const int write_fd = write_fds[write_index].get();
+
+    // std::vector isn't safe to use from the fork child.
+    FileDescriptor* read_array = read_fds.data();
+    FileDescriptor* write_array = write_fds.data();
+
+    pid_t child = fork();
+    if (!child) {
+      // Close every pipe end this child does not use.
+      for (int j = 0; j < num_processes; j++) {
+        if (j != read_index) {
+          read_array[j].reset();
+        }
+        if (j != write_index) {
+          write_array[j].reset();
+        }
+      }
+
+      SwitchChild(read_fd, write_fd);
+      _exit(0);
+    }
+    ASSERT_THAT(child, SyscallSucceeds());
+    children.push_back(child);
+  }
+
+  // Read from current pipe index (0), write to next (1).
+  const int read_index = 0;
+  const int read_fd = read_fds[read_index].get();
+
+  const int write_index = 1;
+  const int write_fd = write_fds[write_index].get();
+
+  // Kick start the loop by injecting the byte that will circulate.
+  char buf = 'a';
+  ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+  // Each iteration waits for the byte to come back around, then resends it.
+  for (auto _ : state) {
+    ASSERT_THAT(ReadFd(read_fd, &buf, 1), SyscallSucceedsWithValue(1));
+    ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+  }
+}
+
+BENCHMARK(BM_ProcessSwitch)->Range(2, 16)->UseRealTime();
+
+// Equivalent to BM_ProcessSwitch using threads instead of processes.
+void BM_ThreadSwitch(benchmark::State& state) {
+  // Code below assumes there are at least two threads.
+  const int num_threads = state.range(0);
+  ASSERT_GE(num_threads, 2);
+
+  // threads is declared before the FD vectors so that it is destroyed last:
+  // the FDs must be closed before the child threads will exit.
+  std::vector<std::unique_ptr<ScopedThread>> threads;
+  std::vector<FileDescriptor> read_fds;
+  std::vector<FileDescriptor> write_fds;
+
+  for (int i = 0; i < num_threads; i++) {
+    int fds[2];
+    ASSERT_THAT(pipe(fds), SyscallSucceeds());
+    read_fds.emplace_back(fds[0]);
+    write_fds.emplace_back(fds[1]);
+  }
+
+  // This thread is one of the threads in the loop. It will be considered
+  // index 0.
+  for (int i = 1; i < num_threads; i++) {
+    // Read from current pipe index, write to next.
+    //
+    // Transfer ownership of the FDs to the thread.
+    const int read_index = i;
+    const int read_fd = read_fds[read_index].release();
+
+    const int write_index = (i + 1) % num_threads;
+    const int write_fd = write_fds[write_index].release();
+
+    threads.emplace_back(std::make_unique<ScopedThread>([read_fd, write_fd] {
+      FileDescriptor read(read_fd);
+      FileDescriptor write(write_fd);
+      SwitchChild(read.get(), write.get());
+    }));
+  }
+
+  // Read from current pipe index (0), write to next (1).
+  const int read_index = 0;
+  const int read_fd = read_fds[read_index].get();
+
+  const int write_index = 1;
+  const int write_fd = write_fds[write_index].get();
+
+  // Kick start the loop by injecting the byte that will circulate.
+  char buf = 'a';
+  ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+  // Each iteration waits for the byte to come back around, then resends it.
+  for (auto _ : state) {
+    ASSERT_THAT(ReadFd(read_fd, &buf, 1), SyscallSucceedsWithValue(1));
+    ASSERT_THAT(WriteFd(write_fd, &buf, 1), SyscallSucceedsWithValue(1));
+  }
+
+  // The two FDs still owned by this thread are closed, causing the next thread
+  // to exit its loop and close its FDs, and so on until all threads exit.
+}
+
+BENCHMARK(BM_ThreadSwitch)->Range(2, 16)->UseRealTime();
+
+void BM_ThreadStart(benchmark::State& state) {
+  const int num_threads = state.range(0);
+
+  for (auto _ : state) {
+    state.PauseTiming();
+    // The barrier counts the num_threads new threads plus this one. It is
+    // heap-allocated so that whichever caller sees Block() == true can free it.
+    auto barrier = new absl::Barrier(num_threads + 1);
+    std::vector<std::unique_ptr<ScopedThread>> threads;
+    state.ResumeTiming();
+
+    // Timed region: start every thread and wait for all to reach the barrier.
+    for (int i = 0; i < num_threads; ++i) {
+      threads.emplace_back(std::make_unique<ScopedThread>([barrier] {
+        if (barrier->Block()) {
+          delete barrier;
+        }
+      }));
+    }
+
+    if (barrier->Block()) {
+      delete barrier;
+    }
+
+    // Joining the threads is excluded from the measurement.
+    state.PauseTiming();
+    for (const auto& thread : threads) {
+      thread->Join();
+    }
+    state.ResumeTiming();
+  }
+}
+
+BENCHMARK(BM_ThreadStart)->Range(1, 2048)->UseRealTime();
+
+// Benchmark the complete process lifecycle: fork + exit + wait.
+void BM_ProcessLifecycle(benchmark::State& state) {
+  const int num_procs = state.range(0);
+
+  std::vector<pid_t> pids(num_procs);
+  for (auto _ : state) {
+    // Fork num_procs children that exit immediately, recording each PID.
+    for (int i = 0; i < num_procs; ++i) {
+      const pid_t pid = fork();
+      if (pid == 0) _exit(0);
+      ASSERT_THAT(pid, SyscallSucceeds());
+      pids[i] = pid;
+    }
+
+    // Reap every child; waitpid must return the PID that was waited for.
+    for (const pid_t pid : pids) {
+      ASSERT_THAT(RetryEINTR(waitpid)(pid, nullptr, 0),
+                  SyscallSucceedsWithValue(pid));
+    }
+  }
+}
+
+BENCHMARK(BM_ProcessLifecycle)->Range(1, 512)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/futex_benchmark.cc b/test/perf/linux/futex_benchmark.cc
new file mode 100644
index 000000000..b349d50bf
--- /dev/null
+++ b/test/perf/linux/futex_benchmark.cc
@@ -0,0 +1,248 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/futex.h>
+
+#include <atomic>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <ctime>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Blocks on `v` for as long as it still holds `val` (FUTEX_WAIT, no timeout).
+//
+// Returns the raw syscall(2) result: 0 once woken, or -1 with errno set
+// (e.g. EAGAIN when `*v != val`).
+inline int FutexWait(std::atomic<int32_t>* v, int32_t val) {
+  // futex(2) takes (uaddr, op, val, timeout). The expected value has to be
+  // passed explicitly, and the op must be a wait operation rather than the
+  // FUTEX_BITSET_MATCH_ANY mask.
+  return syscall(SYS_futex, v, FUTEX_WAIT_PRIVATE, val, nullptr);
+}
+
+// Like FutexWait(), but gives up after the relative timeout `reltime`.
+//
+// Returns the raw syscall(2) result: 0 once woken, or -1 with errno set
+// (e.g. ETIMEDOUT on expiry, EAGAIN when `*v != val`).
+inline int FutexWaitRelativeTimeout(std::atomic<int32_t>* v, int32_t val,
+                                    const struct timespec* reltime) {
+  // `val` must precede the timeout; omitting it shifts the timeout pointer
+  // into the value slot.
+  return syscall(SYS_futex, v, FUTEX_WAIT_PRIVATE, val, reltime);
+}
+
+// Waits on `v` while it holds `val` until the absolute CLOCK_REALTIME time
+// `abstime` (a null `abstime` means no deadline), matching any wake bitset.
+//
+// Returns the raw syscall(2) result: 0 once woken, or -1 with errno set
+// (e.g. ETIMEDOUT on expiry, EAGAIN when `*v != val`).
+inline int FutexWaitAbsoluteTimeout(std::atomic<int32_t>* v, int32_t val,
+                                    const struct timespec* abstime) {
+  // Absolute deadlines need FUTEX_WAIT_BITSET; FUTEX_BITSET_MATCH_ANY is the
+  // bitset argument (last), not the operation. CLOCK_REALTIME is selected
+  // because the deadline used with this helper is derived from absl::Now().
+  return syscall(SYS_futex, v, FUTEX_WAIT_BITSET_PRIVATE | FUTEX_CLOCK_REALTIME,
+                 val, abstime, nullptr, FUTEX_BITSET_MATCH_ANY);
+}
+
+// FUTEX_WAIT_BITSET against CLOCK_REALTIME: waits on `v` while it holds `val`
+// until the absolute time `abstime`, with `bits` as the wake bitset. Returns
+// the raw syscall(2) result.
+inline int FutexWaitBitsetAbsoluteTimeout(std::atomic<int32_t>* v, int32_t val,
+                                          int32_t bits,
+                                          const struct timespec* abstime) {
+  constexpr int kWaitOp = FUTEX_WAIT_BITSET_PRIVATE | FUTEX_CLOCK_REALTIME;
+  return syscall(SYS_futex, v, kWaitOp, val, abstime, nullptr, bits);
+}
+
+// Issues FUTEX_WAKE on `v` for at most `count` waiters and returns the raw
+// syscall(2) result.
+inline int FutexWake(std::atomic<int32_t>* v, int32_t count) {
+  const long result = syscall(SYS_futex, v, FUTEX_WAKE_PRIVATE, count);
+  return result;
+}
+
+// Measures FUTEX_WAKE on an address with nothing waiting, i.e. the simplest
+// possible futex call. Each call is expected to report zero woken waiters.
+void BM_FutexWakeNop(benchmark::State& state) {
+  // Local futex word; no other thread ever waits on it.
+  std::atomic<int32_t> v(0);
+
+  for (auto _ : state) {
+    EXPECT_EQ(0, FutexWake(&v, 1));
+  }
+}
+
+BENCHMARK(BM_FutexWakeNop);
+
+// This just uses FUTEX_WAIT on an address whose value has changed, i.e., the
+// syscall won't wait: the word holds 0 while the expected value passed is 1.
+void BM_FutexWaitNop(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+
+  for (auto _ : state) {
+    // FutexWait() returns the raw syscall(2) result, so a value mismatch is
+    // reported as -1 with errno == EAGAIN, never as -EAGAIN.
+    const int ret = FutexWait(&v, 1);
+    const int err = errno;  // Saved before the checks can disturb it.
+    EXPECT_EQ(-1, ret);
+    EXPECT_EQ(EAGAIN, err);
+  }
+}
+
+BENCHMARK(BM_FutexWaitNop);
+
+// This uses FUTEX_WAIT with a timeout on an address whose value never
+// changes, such that it always times out. Timeout overhead can be estimated by
+// timer overruns for short timeouts. `state.range(0)` is the relative timeout
+// in nanoseconds.
+void BM_FutexWaitTimeout(benchmark::State& state) {
+  const int timeout_ns = state.range(0);
+  std::atomic<int32_t> v(0);
+  auto ts = absl::ToTimespec(absl::Nanoseconds(timeout_ns));
+
+  for (auto _ : state) {
+    // The wrapper returns the raw syscall(2) result, so expiry shows up as -1
+    // with errno == ETIMEDOUT, never as -ETIMEDOUT.
+    const int ret = FutexWaitRelativeTimeout(&v, 0, &ts);
+    const int err = errno;  // Saved before the checks can disturb it.
+    EXPECT_EQ(-1, ret);
+    EXPECT_EQ(ETIMEDOUT, err);
+  }
+}
+
+BENCHMARK(BM_FutexWaitTimeout)
+    ->Arg(1)
+    ->Arg(10)
+    ->Arg(100)
+    ->Arg(1000)
+    ->Arg(10000);
+
+// This calls FUTEX_WAIT_BITSET with CLOCK_REALTIME. The absolute deadline is
+// `state.range(0)` nanoseconds after the epoch, and every wait is expected to
+// time out.
+void BM_FutexWaitBitset(benchmark::State& state) {
+  std::atomic<int32_t> v(0);
+  int timeout_ns = state.range(0);
+  auto ts = absl::ToTimespec(absl::Nanoseconds(timeout_ns));
+  for (auto _ : state) {
+    // The wrapper returns the raw syscall(2) result, so expiry shows up as -1
+    // with errno == ETIMEDOUT, never as -ETIMEDOUT.
+    const int ret = FutexWaitBitsetAbsoluteTimeout(&v, 0, 1, &ts);
+    const int err = errno;  // Saved before the checks can disturb it.
+    EXPECT_EQ(-1, ret);
+    EXPECT_EQ(ETIMEDOUT, err);
+  }
+}
+
+BENCHMARK(BM_FutexWaitBitset)->Range(0, 100000);
+
+// Returns the current CLOCK_MONOTONIC reading expressed in nanoseconds.
+int64_t GetCurrentMonotonicTimeNanos() {
+  constexpr unsigned long long kNanosPerSecond = 1000000000ULL;
+
+  struct timespec ts;
+  TEST_CHECK(clock_gettime(CLOCK_MONOTONIC, &ts) != -1);
+
+  const auto whole_seconds_ns = ts.tv_sec * kNanosPerSecond;
+  return whole_seconds_ns + ts.tv_nsec;
+}
+
+// Busy-waits until `delay_ns` nanoseconds of monotonic time have elapsed.
+// Non-positive delays return immediately without reading the clock.
+void SpinNanos(int64_t delay_ns) {
+  if (delay_ns > 0) {
+    const int64_t deadline = GetCurrentMonotonicTimeNanos() + delay_ns;
+    for (;;) {
+      if (GetCurrentMonotonicTimeNanos() >= deadline) {
+        break;
+      }
+      // Keep spinning: the point is to burn CPU, not to sleep.
+    }
+  }
+}
+
+// Each iteration of FutexRoundtripDelayed involves a thread sending a futex
+// wakeup to another thread, which spins for delay_us and then sends a futex
+// wakeup back. The time per iteration is 2 * (delay_us + kBeforeWakeDelayNs +
+// futex/scheduling overhead).
+//
+// `state.range(0)` is delay_us. The futex word `v` is the handshake token:
+// the benchmark thread sets it to 1, and the helper thread sets it back to 0.
+void BM_FutexRoundtripDelayed(benchmark::State& state) {
+  const int delay_us = state.range(0);
+
+  const int64_t delay_ns = delay_us * 1000;
+  // Spin for an extra kBeforeWakeDelayNs before invoking FUTEX_WAKE to reduce
+  // the probability that the wakeup comes before the wait, preventing the wait
+  // from ever taking effect and causing the benchmark to underestimate the
+  // actual wakeup time.
+  constexpr int64_t kBeforeWakeDelayNs = 500;
+  std::atomic<int32_t> v(0);
+  // Helper thread: performs state.max_iterations handshakes. In each one it
+  // waits for `v` to leave 0, spins, then resets `v` to 0 and wakes the
+  // benchmark thread.
+  ScopedThread t([&] {
+    for (int i = 0; i < state.max_iterations; i++) {
+      SpinNanos(delay_ns);
+      // Re-check the word after every return from FutexWait().
+      while (v.load(std::memory_order_acquire) == 0) {
+        FutexWait(&v, 0);
+      }
+      SpinNanos(kBeforeWakeDelayNs + delay_ns);
+      v.store(0, std::memory_order_release);
+      FutexWake(&v, 1);
+    }
+  });
+  // Benchmark thread: the mirror image. It publishes 1, wakes the helper, and
+  // then waits for the helper to put 0 back.
+  for (auto _ : state) {
+    SpinNanos(kBeforeWakeDelayNs + delay_ns);
+    v.store(1, std::memory_order_release);
+    FutexWake(&v, 1);
+    SpinNanos(delay_ns);
+    while (v.load(std::memory_order_acquire) == 1) {
+      FutexWait(&v, 1);
+    }
+  }
+}
+
+BENCHMARK(BM_FutexRoundtripDelayed)
+    ->Arg(0)
+    ->Arg(10)
+    ->Arg(20)
+    ->Arg(50)
+    ->Arg(100);
+
+// FutexLock is a simple, dumb futex based lock implementation.
+// It will try to acquire the lock by atomically incrementing the
+// lock word. If it did not increment the lock from 0 to 1, someone
+// else has the lock, so it will FUTEX_WAIT until it is woken in
+// the unlock path.
+class FutexLock {
+ public:
+  FutexLock() : lock_word_(0) {}
+
+  // Acquires the lock, sleeping in FutexWaitAbsoluteTimeout() with `deadline`
+  // (which may be null) whenever the lock is contended. Any wait failure
+  // other than EWOULDBLOCK/EINTR is reported through FAIL(), in which case
+  // this returns without holding the lock.
+  void lock(struct timespec* deadline) {
+    int32_t val;
+    while ((val = lock_word_.fetch_add(1, std::memory_order_acquire) + 1) !=
+           1) {
+      // If we didn't get the lock by incrementing from 0 to 1,
+      // do a FUTEX_WAIT with the desired current value set to
+      // val. If val is no longer what the atomic increment returned,
+      // someone might have set it to 0 so we can try to acquire
+      // again.
+      const int ret = FutexWaitAbsoluteTimeout(&lock_word_, val, deadline);
+      // Save errno right away; building the failure message may clobber it.
+      const int err = errno;
+      // The wait helper returns the raw syscall(2) result: failures are -1
+      // with the cause in errno, never a negated errno value.
+      const bool retry =
+          ret == 0 || (ret == -1 && (err == EWOULDBLOCK || err == EINTR));
+      if (!retry) {
+        FAIL() << "unexpected FUTEX_WAIT return: " << ret << " (errno " << err
+               << ")";
+      }
+    }
+  }
+
+  // Releases the lock and wakes at most one waiter.
+  void unlock() {
+    // Store 0 into the lock word and wake one waiter. We intentionally
+    // ignore the return value of the FUTEX_WAKE here, since there may be
+    // no waiters to wake anyway.
+    lock_word_.store(0, std::memory_order_release);
+    (void)FutexWake(&lock_word_, 1);
+  }
+
+ private:
+  // 0 when unlocked; any other value means the lock is held.
+  std::atomic<int32_t> lock_word_;
+};
+
+FutexLock* test_lock;  // Used below.
+
+// Shared body of the contention benchmarks: each benchmark thread repeatedly
+// takes and releases the global `test_lock`, passing `deadline` to every
+// lock() call. The thread with index 0 creates the lock before the timed loop
+// and deletes it afterwards.
+//
+// NOTE(review): this presumably relies on the benchmark framework
+// synchronizing all threads at loop entry and exit; confirm.
+void FutexContend(benchmark::State& state, int thread_index,
+                  struct timespec* deadline) {
+  const bool owns_lock = (thread_index == 0);
+  if (owns_lock) {
+    test_lock = new FutexLock();
+  }
+
+  // Work performed while the lock is held.
+  int counter = 0;
+  for (auto _ : state) {
+    test_lock->lock(deadline);
+    ++counter;
+    test_lock->unlock();
+  }
+
+  if (owns_lock) {
+    delete test_lock;
+  }
+  state.SetItemsProcessed(state.iterations());
+}
+
+// Lock contention benchmark in which waits carry no deadline (a null
+// timespec is passed down to the lock).
+void BM_FutexContend(benchmark::State& state) {
+  struct timespec* const no_deadline = nullptr;
+  FutexContend(state, state.thread_index, no_deadline);
+}
+
+BENCHMARK(BM_FutexContend)->ThreadRange(1, 1024)->UseRealTime();
+
+// Lock contention benchmark in which every wait carries an absolute deadline
+// of absl::Now() plus ten minutes, computed once per benchmark thread.
+void BM_FutexDeadlineContend(benchmark::State& state) {
+  const absl::Time expiry = absl::Now() + absl::Minutes(10);
+  struct timespec deadline = absl::ToTimespec(expiry);
+  FutexContend(state, state.thread_index, &deadline);
+}
+
+BENCHMARK(BM_FutexDeadlineContend)->ThreadRange(1, 1024)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/getdents_benchmark.cc b/test/perf/linux/getdents_benchmark.cc
new file mode 100644
index 000000000..0e03975b4
--- /dev/null
+++ b/test/perf/linux/getdents_benchmark.cc
@@ -0,0 +1,149 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+// Fallback for toolchains whose headers do not define SYS_getdents64.
+// Syscall numbers are per-architecture: arm64 uses the asm-generic table,
+// where getdents64 is 61, while 217 is the x86_64 number.
+#ifndef SYS_getdents64
+#if defined(__x86_64__)
+#define SYS_getdents64 217
+#elif defined(__aarch64__)
+#define SYS_getdents64 61
+#else
+#error "Unknown architecture"
+#endif
+#endif  // SYS_getdents64
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int kBufferSize = 16384;
+
+// Creates a fresh temporary directory, populates it with `count` regular
+// files (mode 0644) created via mknodat, and appends each file's name to
+// `files`. Returns the directory, or the first error encountered.
+PosixErrorOr<TempPath> CreateDirectory(int count,
+                                       std::vector<std::string>* files) {
+  ASSIGN_OR_RETURN_ERRNO(TempPath dir, TempPath::CreateDir());
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor dfd,
+                         Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+  for (int created = 0; created < count; ++created) {
+    // Names are relative, so each file is created inside `dfd`.
+    auto name = NewTempRelPath();
+    auto res = MknodAt(dfd, name, S_IFREG | 0644, 0);
+    RETURN_IF_ERRNO(res);
+    files->push_back(name);
+  }
+
+  return std::move(dir);
+}
+
+// Unlinks every name in `files` relative to the directory `dir`. Stops at,
+// and returns, the first error; returns NoError() once all are removed.
+PosixError CleanupDirectory(const TempPath& dir,
+                            std::vector<std::string>* files) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor dfd,
+                         Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+  for (const std::string& name : *files) {
+    auto res = UnlinkAt(dfd, name, 0);
+    RETURN_IF_ERRNO(res);
+  }
+  return NoError();
+}
+
+// Creates a directory containing `state.range(0)` files, and repeatedly reads
+// all of its entries through one FD that is rewound before each pass.
+void BM_GetdentsSameFD(benchmark::State& state) {
+  const int count = state.range(0);
+
+  // Track the created file names so they can be unlinked explicitly below.
+  //
+  // Normally, we'd simply allow dir to recursively clean up the contained
+  // files, but that recursive cleanup uses getdents, which may be very slow in
+  // extreme benchmarks.
+  TempPath dir;
+  std::vector<std::string> files;
+  dir = ASSERT_NO_ERRNO_AND_VALUE(CreateDirectory(count, &files));
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY));
+  char buffer[kBufferSize];
+
+  // Each pass reads the whole directory, but is reported as a batch of
+  // `count` iterations so that reported times are per file.
+  while (state.KeepRunningBatch(count)) {
+    ASSERT_THAT(lseek(fd.get(), 0, SEEK_SET), SyscallSucceeds());
+
+    // Drain the directory: getdents64 returns 0 once no entries remain.
+    for (;;) {
+      int ret;
+      ASSERT_THAT(ret = syscall(SYS_getdents64, fd.get(), buffer, kBufferSize),
+                  SyscallSucceeds());
+      if (ret <= 0) {
+        break;
+      }
+    }
+  }
+
+  ASSERT_NO_ERRNO(CleanupDirectory(dir, &files));
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_GetdentsSameFD)->Range(1, 1 << 16)->UseRealTime();
+
+// Creates a directory containing `state.range(0)` files, and repeatedly reads
+// all of its entries, opening a new FD for every pass.
+void BM_GetdentsNewFD(benchmark::State& state) {
+  const int count = state.range(0);
+
+  // Track the created file names so they can be unlinked explicitly below.
+  //
+  // Normally, we'd simply allow dir to recursively clean up the contained
+  // files, but that recursive cleanup uses getdents, which may be very slow in
+  // extreme benchmarks.
+  TempPath dir;
+  std::vector<std::string> files;
+  dir = ASSERT_NO_ERRNO_AND_VALUE(CreateDirectory(count, &files));
+  char buffer[kBufferSize];
+
+  // Each pass reads the whole directory, but is reported as a batch of
+  // `count` iterations so that reported times are per file.
+  while (state.KeepRunningBatch(count)) {
+    // The open (and the close when `fd` goes out of scope) is part of the
+    // measured work.
+    FileDescriptor fd =
+        ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+    // Drain the directory: getdents64 returns 0 once no entries remain.
+    for (;;) {
+      int ret;
+      ASSERT_THAT(ret = syscall(SYS_getdents64, fd.get(), buffer, kBufferSize),
+                  SyscallSucceeds());
+      if (ret <= 0) {
+        break;
+      }
+    }
+  }
+
+  ASSERT_NO_ERRNO(CleanupDirectory(dir, &files));
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_GetdentsNewFD)->Range(1, 1 << 16)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/getpid_benchmark.cc b/test/perf/linux/getpid_benchmark.cc
new file mode 100644
index 000000000..db74cb264
--- /dev/null
+++ b/test/perf/linux/getpid_benchmark.cc
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Measures a getpid system call issued directly through syscall(2).
+void BM_Getpid(benchmark::State& state) {
+  for (auto ignored : state) {
+    // Only the cost of the call matters; the returned pid is discarded.
+    static_cast<void>(syscall(SYS_getpid));
+  }
+}
+
+BENCHMARK(BM_Getpid);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/gettid_benchmark.cc b/test/perf/linux/gettid_benchmark.cc
new file mode 100644
index 000000000..8f4961f5e
--- /dev/null
+++ b/test/perf/linux/gettid_benchmark.cc
@@ -0,0 +1,38 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Measures a gettid system call issued directly through syscall(2).
+void BM_Gettid(benchmark::State& state) {
+  for (auto ignored : state) {
+    // Only the cost of the call matters; the returned tid is discarded.
+    static_cast<void>(syscall(SYS_gettid));
+  }
+}
+
+BENCHMARK(BM_Gettid)->ThreadRange(1, 4000)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/mapping_benchmark.cc b/test/perf/linux/mapping_benchmark.cc
new file mode 100644
index 000000000..39c30fe69
--- /dev/null
+++ b/test/perf/linux/mapping_benchmark.cc
@@ -0,0 +1,163 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/memory_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Conservative value for /proc/sys/vm/max_map_count, which limits the number of
+// VMAs, minus a safety margin for VMAs that already exist for the test binary.
+// The default value for max_map_count is
+// include/linux/mm.h:DEFAULT_MAX_MAP_COUNT = 65530.
+constexpr size_t kMaxVMAs = 64001;
+
+// Maps and then immediately unmaps an anonymous private region of
+// `state.range(0)` pages, without ever touching the memory.
+void BM_MapUnmap(benchmark::State& state) {
+  const int pages = state.range(0);
+  // Byte length of the region mapped on every iteration.
+  const auto length = pages * kPageSize;
+
+  while (state.KeepRunning()) {
+    void* addr = mmap(nullptr, length, PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    TEST_CHECK_MSG(addr != MAP_FAILED, "mmap failed");
+
+    int ret = munmap(addr, length);
+    TEST_CHECK_MSG(ret == 0, "munmap failed");
+  }
+}
+
+BENCHMARK(BM_MapUnmap)->Range(1, 1 << 17)->UseRealTime();
+
+// Maps an anonymous private region of `state.range(0)` pages, writes one byte
+// to every page, and then unmaps the region.
+void BM_MapTouchUnmap(benchmark::State& state) {
+  const int pages = state.range(0);
+  // Byte length of the region mapped on every iteration.
+  const auto length = pages * kPageSize;
+
+  while (state.KeepRunning()) {
+    void* addr = mmap(nullptr, length, PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    TEST_CHECK_MSG(addr != MAP_FAILED, "mmap failed");
+
+    // A single store per page is enough to touch it.
+    char* const end = reinterpret_cast<char*>(addr) + length;
+    for (char* c = reinterpret_cast<char*>(addr); c < end; c += kPageSize) {
+      *c = 42;
+    }
+
+    int ret = munmap(addr, length);
+    TEST_CHECK_MSG(ret == 0, "munmap failed");
+  }
+}
+
+BENCHMARK(BM_MapTouchUnmap)->Range(1, 1 << 17)->UseRealTime();
+
+// Creates `state.range(0)` separate single-page anonymous mappings, writing to
+// each page as soon as it is mapped, and only then unmaps them one by one.
+//
+// NOTE(b/111429208): This is a regression test to ensure performant mapping and
+// allocation even with tons of mappings.
+void BM_MapTouchMany(benchmark::State& state) {
+  const int page_count = state.range(0);
+
+  while (state.KeepRunning()) {
+    // Addresses of every page mapped during this iteration.
+    std::vector<void*> pages;
+
+    for (int mapped = 0; mapped < page_count; ++mapped) {
+      void* addr = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+      TEST_CHECK_MSG(addr != MAP_FAILED, "mmap failed");
+
+      // Touch the page before moving on to the next mapping.
+      *reinterpret_cast<char*>(addr) = 42;
+      pages.push_back(addr);
+    }
+
+    for (void* addr : pages) {
+      int ret = munmap(addr, kPageSize);
+      TEST_CHECK_MSG(ret == 0, "munmap failed");
+    }
+  }
+
+  state.SetBytesProcessed(kPageSize * page_count * state.iterations());
+}
+
+BENCHMARK(BM_MapTouchMany)->Range(1, 1 << 12)->UseRealTime();
+
+// Measures a read of one not-yet-touched page per iteration. The test region
+// alternates readable pages (even page indices) with PROT_NONE pages (odd
+// page indices); once every readable page has been read, the region is reset
+// with madvise(MADV_DONTNEED) while the timer is paused.
+void BM_PageFault(benchmark::State& state) {
+  // Map the region in which we will take page faults. To ensure that each page
+  // fault maps only a single page, each page we touch must correspond to a
+  // distinct VMA. Thus we need a 1-page gap between each 1-page VMA. However,
+  // each gap consists of a PROT_NONE VMA, instead of an unmapped hole, so that
+  // if there are background threads running, they can't inadvertently create
+  // mappings in our gaps that are unmapped when the test ends.
+  size_t test_pages = kMaxVMAs;
+  // Ensure that test_pages is odd, since we want the test region to both
+  // begin and end with a mapped page.
+  if (test_pages % 2 == 0) {
+    test_pages--;
+  }
+  const size_t test_region_bytes = test_pages * kPageSize;
+  // Use MAP_SHARED here because madvise(MADV_DONTNEED) on private mappings on
+  // gVisor won't force future sentry page faults (by design). Use MAP_POPULATE
+  // so that Linux pre-allocates the shmem file used to back the mapping.
+  Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+      MmapAnon(test_region_bytes, PROT_READ, MAP_SHARED | MAP_POPULATE));
+  // Turn every odd-indexed page into a PROT_NONE gap.
+  for (size_t i = 0; i < test_pages / 2; i++) {
+    ASSERT_THAT(
+        mprotect(reinterpret_cast<void*>(m.addr() + ((2 * i + 1) * kPageSize)),
+                 kPageSize, PROT_NONE),
+        SyscallSucceeds());
+  }
+
+  // Number of readable (even-indexed) pages in the region.
+  const size_t mapped_pages = test_pages / 2 + 1;
+  // "Start" at the end of the mapped region to force the mapped region to be
+  // reset, since we mapped it with MAP_POPULATE.
+  size_t cur_page = mapped_pages;
+  for (auto _ : state) {
+    if (cur_page >= mapped_pages) {
+      // We've reached the end of our mapped region and have to reset it to
+      // incur page faults again.
+      state.PauseTiming();
+      ASSERT_THAT(madvise(m.ptr(), test_region_bytes, MADV_DONTNEED),
+                  SyscallSucceeds());
+      cur_page = 0;
+      state.ResumeTiming();
+    }
+    // cur_page counts readable pages, which sit at even page indices.
+    const uintptr_t addr = m.addr() + (2 * cur_page * kPageSize);
+    // Volatile read so that the access itself cannot be elided.
+    const char c = *reinterpret_cast<volatile char*>(addr);
+    benchmark::DoNotOptimize(c);
+    cur_page++;
+  }
+}
+
+BENCHMARK(BM_PageFault)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/open_benchmark.cc b/test/perf/linux/open_benchmark.cc
new file mode 100644
index 000000000..68008f6d5
--- /dev/null
+++ b/test/perf/linux/open_benchmark.cc
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Measures open(2) followed by close(2) on a file picked pseudo-randomly from
+// a pool of `state.range(0)` pre-created temporary files.
+void BM_Open(benchmark::State& state) {
+  const int size = state.range(0);
+
+  // Build the pool before the measured loop starts.
+  std::vector<TempPath> cache;
+  for (int created = 0; created < size; ++created) {
+    auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+    cache.emplace_back(std::move(path));
+  }
+
+  // Fixed seed: every run picks files in the same sequence.
+  unsigned int seed = 1;
+  for (auto _ : state) {
+    const TempPath& target = cache[rand_r(&seed) % size];
+    int fd = open(target.path().c_str(), O_RDONLY);
+    TEST_CHECK(fd != -1);
+    close(fd);
+  }
+}
+
+BENCHMARK(BM_Open)->Range(1, 128)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/pipe_benchmark.cc b/test/perf/linux/pipe_benchmark.cc
new file mode 100644
index 000000000..8f5f6a2a3
--- /dev/null
+++ b/test/perf/linux/pipe_benchmark.cc
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cerrno>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Measures pipe throughput for messages of `state.range(0)` bytes: a helper
+// thread writes state.max_iterations messages into the pipe while the
+// benchmark loop reads one message per iteration.
+void BM_Pipe(benchmark::State& state) {
+  int fds[2];
+  TEST_CHECK(pipe(fds) == 0);
+
+  const int size = state.range(0);
+  std::vector<char> wbuf(size);
+  std::vector<char> rbuf(size);
+  RandomizeBuffer(wbuf.data(), size);
+
+  // Writer side; it is joined below before either end of the pipe is closed.
+  ScopedThread t([&] {
+    const int fd = fds[1];
+    for (int sent = 0; sent < state.max_iterations; ++sent) {
+      TEST_CHECK(WriteFd(fd, wbuf.data(), wbuf.size()) == size);
+    }
+  });
+
+  // Reader side.
+  for (auto _ : state) {
+    TEST_CHECK(ReadFd(fds[0], rbuf.data(), rbuf.size()) == size);
+  }
+
+  t.Join();
+
+  close(fds[0]);
+  close(fds[1]);
+
+  const int64_t bytes_per_iteration = static_cast<int64_t>(size);
+  const int64_t iterations = static_cast<int64_t>(state.iterations());
+  state.SetBytesProcessed(bytes_per_iteration * iterations);
+}
+
+BENCHMARK(BM_Pipe)->Range(1, 1 << 20)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/randread_benchmark.cc b/test/perf/linux/randread_benchmark.cc
new file mode 100644
index 000000000..b0eb8c24e
--- /dev/null
+++ b/test/perf/linux/randread_benchmark.cc
@@ -0,0 +1,100 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Create a 1GB file that will be read from at random positions. This should
+// invalidate any performance gains from caching.
+const uint64_t kFileSize = 1ULL << 30;
+
+// How many bytes to write at once to initialize the file used to read from.
+const uint32_t kWriteSize = 65536;
+
+// Largest benchmarked read unit.
+const uint32_t kMaxRead = 1UL << 26;
+
+// Creates a temporary file and fills it through writev() using the iovec
+// batches that GenerateIovecs() builds for `file_size` bytes over a single
+// random kWriteSize-byte buffer. Returns the file's TempPath.
+//
+// NOTE(review): each writev() is only checked for failure (>= 0); a short
+// write would not be detected here.
+TempPath CreateFile(uint64_t file_size) {
+  auto path = TempPath::CreateFile().ValueOrDie();
+  FileDescriptor fd = Open(path.path(), O_WRONLY).ValueOrDie();
+
+  // One random chunk that the iovecs point at.
+  std::vector<char> chunk(kWriteSize);
+  RandomizeBuffer(chunk.data(), chunk.size());
+
+  // Try to minimize syscalls by using maximum size writev() requests.
+  const std::vector<std::vector<struct iovec>> batches =
+      GenerateIovecs(file_size, chunk.data(), chunk.size());
+  for (const auto& iovecs : batches) {
+    TEST_CHECK(writev(fd.get(), iovecs.data(), iovecs.size()) >= 0);
+  }
+
+  return path;
+}
+
+// Process-wide benchmark state: holds the temporary file that the benchmark
+// reads from. Initialized once per process lifetime.
+struct GlobalState {
+  explicit GlobalState(TempPath tfile) : tmpfile(std::move(tfile)) {}
+
+  // The backing file; never reassigned after construction.
+  const TempPath tmpfile;
+};
+
+// Returns the process-wide GlobalState, creating it (and its backing file) on
+// first use.
+GlobalState& GetGlobalState() {
+  // Created only once throughout the lifetime of the process. The object is
+  // dynamically allocated and never deleted, to avoid order-of-destruction
+  // issues with static storage variables.
+  static GlobalState* const state = [] {
+    // The actual file size is the maximum random seek range (kFileSize) + the
+    // maximum read size so we can read that number of bytes at the end of the
+    // file.
+    return new GlobalState(CreateFile(kFileSize + kMaxRead));
+  }();
+  return *state;
+}
+
+// Measures pread() of `state.range(0)` bytes at pseudo-random offsets below
+// kFileSize in the shared benchmark file.
+void BM_RandRead(benchmark::State& state) {
+  const int size = state.range(0);
+
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Open(GetGlobalState().tmpfile.path(), O_RDONLY));
+  std::vector<char> buf(size);
+
+  // Fixed seed: every run reads the same sequence of offsets.
+  unsigned int seed = 1;
+  for (auto _ : state) {
+    TEST_CHECK(PreadFd(fd.get(), buf.data(), buf.size(),
+                       rand_r(&seed) % kFileSize) == size);
+  }
+
+  const int64_t bytes_per_iteration = static_cast<int64_t>(size);
+  const int64_t iterations = static_cast<int64_t>(state.iterations());
+  state.SetBytesProcessed(bytes_per_iteration * iterations);
+}
+
+BENCHMARK(BM_RandRead)->Range(1, kMaxRead)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/read_benchmark.cc b/test/perf/linux/read_benchmark.cc
new file mode 100644
index 000000000..62445867d
--- /dev/null
+++ b/test/perf/linux/read_benchmark.cc
@@ -0,0 +1,53 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Measures pread() of `state.range(0)` bytes from offset 0 of a temporary
+// file that holds exactly that many zero bytes.
+void BM_Read(benchmark::State& state) {
+  const int size = state.range(0);
+
+  // Backing file, created in the test tmpdir with the default file mode.
+  const std::string contents(size, 0);
+  auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), contents, TempPath::kDefaultFileMode));
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDONLY));
+
+  std::vector<char> buf(size);
+  for (auto _ : state) {
+    TEST_CHECK(PreadFd(fd.get(), buf.data(), buf.size(), 0) == size);
+  }
+
+  const int64_t bytes_per_iteration = static_cast<int64_t>(size);
+  const int64_t iterations = static_cast<int64_t>(state.iterations());
+  state.SetBytesProcessed(bytes_per_iteration * iterations);
+}
+
+BENCHMARK(BM_Read)->Range(1, 1 << 26)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/sched_yield_benchmark.cc b/test/perf/linux/sched_yield_benchmark.cc
new file mode 100644
index 000000000..6756b5575
--- /dev/null
+++ b/test/perf/linux/sched_yield_benchmark.cc
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sched.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Measures sched_yield(); every call is required to return 0.
+void BM_Sched_yield(benchmark::State& state) {
+  for (auto ignored : state) {
+    TEST_CHECK(sched_yield() == 0);
+  }
+}
+
+BENCHMARK(BM_Sched_yield)->ThreadRange(1, 2000)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/send_recv_benchmark.cc b/test/perf/linux/send_recv_benchmark.cc
new file mode 100644
index 000000000..d73e49523
--- /dev/null
+++ b/test/perf/linux/send_recv_benchmark.cc
@@ -0,0 +1,372 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+
+#include <cstring>
+
+#include "gtest/gtest.h"
+#include "absl/synchronization/notification.h"
+#include "benchmark/benchmark.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr ssize_t kMessageSize = 1024;  // Default payload size, in bytes.
+
+// Owns a payload buffer, a control buffer, and the iovec/msghdr describing them.
+class Message {
+ public:
+  explicit Message(int byte = 0) : Message(byte, kMessageSize, 0) {}
+  explicit Message(int byte, int sz) : Message(byte, sz, 0) {}
+  // Payload: sz bytes, each set to `byte`. Control buffer: cmsg_sz zero bytes.
+  explicit Message(int byte, int sz, int cmsg_sz)
+      : buffer_(sz, byte), cmsg_buffer_(cmsg_sz, 0) {
+    iov_.iov_base = buffer_.data();
+    iov_.iov_len = sz;
+    hdr_.msg_iov = &iov_;
+    hdr_.msg_iovlen = 1;
+    hdr_.msg_control = cmsg_buffer_.data();
+    hdr_.msg_controllen = cmsg_sz;
+  }
+  // hdr_ points at this object's own iov_, so a copy would alias the original.
+  Message(const Message&) = delete;
+  Message& operator=(const Message&) = delete;
+  struct msghdr* header() { return &hdr_; }
+
+ private:
+  std::vector<char> buffer_;
+  std::vector<char> cmsg_buffer_;
+  struct iovec iov_ = {};
+  struct msghdr hdr_ = {};
+};
+
+// Times recvmsg(2) on a UNIX socketpair fed by a background sendmsg(2) loop.
+void BM_Recvmsg(benchmark::State& state) {
+  int fds[2];
+  TEST_CHECK(0 == socketpair(AF_UNIX, SOCK_STREAM, 0, fds));
+  FileDescriptor send_socket(fds[0]), recv_socket(fds[1]);
+  absl::Notification done;
+  Message outgoing('a'), incoming;
+
+  // Sender loop: runs until `done` fires; sendmsg results are not checked.
+  ScopedThread sender([&outgoing, &send_socket, &done] {
+    while (!done.HasBeenNotified()) {
+      sendmsg(send_socket.get(), outgoing.header(), 0);
+    }
+  });
+
+  int64_t total = 0;
+  for (auto _ : state) {
+    const int got = recvmsg(recv_socket.get(), incoming.header(), 0);
+    TEST_CHECK(got > 0);
+    total += got;
+  }
+  done.Notify();
+  recv_socket.reset();
+  state.SetBytesProcessed(total);
+}
+
+BENCHMARK(BM_Recvmsg)->UseRealTime();
+
+// Times sendmsg(2) on a UNIX socketpair drained by a background recvmsg loop.
+void BM_Sendmsg(benchmark::State& state) {
+  int fds[2];
+  TEST_CHECK(0 == socketpair(AF_UNIX, SOCK_STREAM, 0, fds));
+  FileDescriptor send_socket(fds[0]), recv_socket(fds[1]);
+  absl::Notification done;
+  Message outgoing('a'), incoming;
+
+  // Receiver loop: runs until `done` fires; recvmsg results are not checked.
+  ScopedThread receiver([&incoming, &recv_socket, &done] {
+    while (!done.HasBeenNotified()) {
+      recvmsg(recv_socket.get(), incoming.header(), 0);
+    }
+  });
+
+  int64_t total = 0;
+  for (auto _ : state) {
+    const int put = sendmsg(send_socket.get(), outgoing.header(), 0);
+    TEST_CHECK(put > 0);
+    total += put;
+  }
+  done.Notify();
+  send_socket.reset();
+  state.SetBytesProcessed(total);
+}
+
+BENCHMARK(BM_Sendmsg)->UseRealTime();
+
+// Times recvfrom(2) on a UNIX socketpair fed by a background sendto(2) loop.
+void BM_Recvfrom(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  // Zero-fill the payload so that uninitialized stack bytes are never sent.
+  char send_buffer[kMessageSize] = {}, recv_buffer[kMessageSize];
+
+  ScopedThread t([&send_socket, &send_buffer, &notification] {
+    while (!notification.HasBeenNotified()) {
+      sendto(send_socket.get(), send_buffer, kMessageSize, 0, nullptr, 0);
+    }
+  });
+  // 64-bit like the sibling benchmarks: a 32-bit byte total can overflow.
+  int64_t bytes_received = 0;
+  for (auto ignored : state) {
+    int n = recvfrom(recv_socket.get(), recv_buffer, kMessageSize, 0, nullptr,
+                     nullptr);
+    TEST_CHECK(n > 0);
+    bytes_received += n;
+  }
+  notification.Notify();
+  recv_socket.reset();
+  state.SetBytesProcessed(bytes_received);
+}
+
+BENCHMARK(BM_Recvfrom)->UseRealTime();
+
+// Times sendto(2) on a UNIX socketpair drained by a background recvfrom loop.
+void BM_Sendto(benchmark::State& state) {
+  int sockets[2];
+  TEST_CHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == 0);
+  FileDescriptor send_socket(sockets[0]), recv_socket(sockets[1]);
+  absl::Notification notification;
+  // Zero-fill the payload so that uninitialized stack bytes are never sent.
+  char send_buffer[kMessageSize] = {}, recv_buffer[kMessageSize];
+
+  ScopedThread t([&recv_socket, &recv_buffer, &notification] {
+    while (!notification.HasBeenNotified()) {
+      recvfrom(recv_socket.get(), recv_buffer, kMessageSize, 0, nullptr,
+               nullptr);
+    }
+  });
+
+  int64_t bytes_sent = 0;
+  for (auto ignored : state) {
+    int n = sendto(send_socket.get(), send_buffer, kMessageSize, 0, nullptr, 0);
+    TEST_CHECK(n > 0);
+    bytes_sent += n;
+  }
+  notification.Notify();
+  send_socket.reset();
+  state.SetBytesProcessed(bytes_sent);
+}
+
+BENCHMARK(BM_Sendto)->UseRealTime();
+
+// Returns a sockaddr_storage carrying the loopback address for `family`
+// (AF_INET or AF_INET6); any other family yields an EINVAL PosixError.
+PosixErrorOr<sockaddr_storage> InetLoopbackAddr(int family) {
+  struct sockaddr_storage loopback;
+  memset(&loopback, 0, sizeof(loopback));
+  loopback.ss_family = family;
+  if (family == AF_INET) {
+    auto* v4 = reinterpret_cast<struct sockaddr_in*>(&loopback);
+    v4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+  } else if (family == AF_INET6) {
+    auto* v6 = reinterpret_cast<struct sockaddr_in6*>(&loopback);
+    v6->sin6_addr = in6addr_loopback;
+  } else {
+    return PosixError(EINVAL,
+                      absl::StrCat("unknown socket family: ", family));
+  }
+  // Only the family and the address are set; every other byte stays zero.
+  return loopback;
+}
+
+// BM_RecvmsgWithControlBuf measures the performance of recvmsg when we allocate
+// space for control messages. Note that we do not expect to receive any.
+void BM_RecvmsgWithControlBuf(benchmark::State& state) {
+  auto listen_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP));
+
+  // Initialize address to the loopback one.
+  sockaddr_storage addr = ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(AF_INET6));
+  socklen_t addrlen = sizeof(addr);
+
+  // Bind to some port then start listening.
+  ASSERT_THAT(bind(listen_socket.get(),
+                   reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_socket.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the address we're listening on, then connect to it. We need to do this
+  // because we're allowing the stack to pick a port for us.
+  ASSERT_THAT(getsockname(listen_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  auto send_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP));
+  ASSERT_THAT(
+      RetryEINTR(connect)(send_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+      SyscallSucceeds());
+
+  // Accept the connection.
+  auto recv_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_socket.get(), nullptr, nullptr));
+
+  absl::Notification notification;
+  Message send_msg('a');
+  // Create a msghdr with a buffer allocated for control messages.
+  constexpr int kControlLen = 24;
+  Message recv_msg(0, kMessageSize, /*cmsg_sz=*/kControlLen);
+  struct msghdr* const recv_hdr = recv_msg.header();
+
+  ScopedThread t([&send_msg, &send_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      sendmsg(send_socket.get(), send_msg.header(), 0);
+    }
+  });
+  int64_t bytes_received = 0;
+  for (auto ignored : state) {
+    // recvmsg(2) overwrites msg_controllen with the length it received, so
+    // restore it; otherwise only the first call would offer the buffer.
+    recv_hdr->msg_controllen = kControlLen;
+    int n = recvmsg(recv_socket.get(), recv_hdr, 0);
+    TEST_CHECK(n > 0);
+    bytes_received += n;
+  }
+  notification.Notify();
+  recv_socket.reset();
+  state.SetBytesProcessed(bytes_received);
+}
+
+BENCHMARK(BM_RecvmsgWithControlBuf)->UseRealTime();
+
+// BM_SendmsgTCP measures the sendmsg throughput with varying payload sizes.
+//
+// state.Args[0] indicates whether the underlying socket should be blocking or
+// non-blocking w/ 0 indicating non-blocking and 1 to indicate blocking.
+// state.Args[1] is the size of the payload to be used per sendmsg call.
+void BM_SendmsgTCP(benchmark::State& state) {
+  auto listen_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
+
+  // Initialize address to the loopback one.
+  sockaddr_storage addr = ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(AF_INET));
+  socklen_t addrlen = sizeof(addr);
+
+  // Bind to some port then start listening.
+  ASSERT_THAT(bind(listen_socket.get(),
+                   reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+              SyscallSucceeds());
+
+  ASSERT_THAT(listen(listen_socket.get(), SOMAXCONN), SyscallSucceeds());
+
+  // Get the address we're listening on, then connect to it. We need to do this
+  // because we're allowing the stack to pick a port for us.
+  ASSERT_THAT(getsockname(listen_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+              SyscallSucceeds());
+
+  auto send_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
+
+  ASSERT_THAT(
+      RetryEINTR(connect)(send_socket.get(),
+                          reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+      SyscallSucceeds());
+
+  // Accept the connection.
+  auto recv_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_socket.get(), nullptr, nullptr));
+
+  // Check if we want to run the test w/ a blocking send socket
+  // or non-blocking.
+  const int blocking = state.range(0);
+  if (!blocking) {
+    // Set the send FD to O_NONBLOCK.
+    int opts;
+    ASSERT_THAT(opts = fcntl(send_socket.get(), F_GETFL), SyscallSucceeds());
+    opts |= O_NONBLOCK;
+    ASSERT_THAT(fcntl(send_socket.get(), F_SETFL, opts), SyscallSucceeds());
+  }
+
+  absl::Notification notification;
+
+  // Get the buffer size we should use for this iteration of the test.
+  const int buf_size = state.range(1);
+  Message send_msg('a', buf_size), recv_msg(0, buf_size);
+
+  ScopedThread t([&recv_msg, &recv_socket, &notification] {
+    while (!notification.HasBeenNotified()) {
+      TEST_CHECK(recvmsg(recv_socket.get(), recv_msg.header(), 0) >= 0);
+    }
+  });
+
+  int64_t bytes_sent = 0;
+  for (auto ignored : state) {
+    int sent = 0;
+    while (true) {
+      // Send whatever part of the payload is still outstanding.
+      struct msghdr hdr = {};
+      struct iovec iov = {};
+      struct msghdr* snd_header = send_msg.header();
+      iov.iov_base = static_cast<char*>(snd_header->msg_iov->iov_base) + sent;
+      iov.iov_len = snd_header->msg_iov->iov_len - sent;
+      hdr.msg_iov = &iov;
+      hdr.msg_iovlen = 1;
+      int n = RetryEINTR(sendmsg)(send_socket.get(), &hdr, 0);
+      if (n > 0) {
+        sent += n;
+        if (sent == buf_size) {
+          break;
+        }
+        // n can be > 0 but less than requested size. In which case we don't
+        // poll.
+        continue;
+      }
+      // Wait up to 10ms for the fd to become writable. The event flag is
+      // POLLOUT; POLL_OUT is an unrelated siginfo si_code constant. poll may
+      // return 0 (timeout) or 1 (ready), so only require that it succeeds.
+      struct pollfd poll_fd = {send_socket.get(), POLLOUT, 0};
+      EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10), SyscallSucceeds());
+    }
+    bytes_sent += static_cast<int64_t>(sent);
+  }
+
+  notification.Notify();
+  send_socket.reset();
+  state.SetBytesProcessed(bytes_sent);
+}
+
+// Registers {blocking, buf_size}: both modes x sizes 1KiB, 2KiB, ..., 256MiB.
+void Args(benchmark::internal::Benchmark* benchmark) {
+  for (int mode = 0; mode <= 1; ++mode) {
+    for (int len = 1 << 10; len <= (256 << 20); len <<= 1) {
+      benchmark->Args({mode, len});
+    }
+  }
+}
+BENCHMARK(BM_SendmsgTCP)->Apply(&Args)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/seqwrite_benchmark.cc b/test/perf/linux/seqwrite_benchmark.cc
new file mode 100644
index 000000000..af49e4477
--- /dev/null
+++ b/test/perf/linux/seqwrite_benchmark.cc
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Upper bound on the write offset (1GiB): once the offset reaches it, writes
+// wrap around to 0. This should be large enough to blow away caches.
+const uint64_t kMaxFile = 1 << 30;
+
+// Writes state.range(0)-byte chunks back to back into one temporary file,
+// restarting at offset 0 whenever the offset reaches kMaxFile.
+void BM_SeqWrite(benchmark::State& state) {
+  auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY));
+
+  // The chunk is allocated and filled once, before the timed loop.
+  const int size = state.range(0);
+  std::vector<char> chunk(size);
+  RandomizeBuffer(chunk.data(), chunk.size());
+
+  uint64_t pos = 0;  // Offset of the next write.
+  for (auto _ : state) {
+    // A short or failed write fails the check.
+    TEST_CHECK(chunk.size() ==
+               PwriteFd(fd.get(), chunk.data(), chunk.size(), pos));
+    pos += chunk.size();
+    if (pos >= kMaxFile) {
+      pos = 0;  // Wrap around.
+    }
+  }
+
+  const int64_t iterations = static_cast<int64_t>(state.iterations());
+  state.SetBytesProcessed(static_cast<int64_t>(size) * iterations);
+}
+
+BENCHMARK(BM_SeqWrite)->Range(1, 1 << 26)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/signal_benchmark.cc b/test/perf/linux/signal_benchmark.cc
new file mode 100644
index 000000000..a6928df58
--- /dev/null
+++ b/test/perf/linux/signal_benchmark.cc
@@ -0,0 +1,59 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <string.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void FixupHandler(int sig, siginfo_t* si, void* void_ctx) {
+  static unsigned int dataval = 0;  // Valid storage to redirect the access to.
+  // Fault fixup: point the saved RAX at dataval rather than skipping the
+  // faulting instruction; no other register in the saved context is changed.
+  ucontext_t* ctx = reinterpret_cast<ucontext_t*>(void_ctx);
+  ctx->uc_mcontext.gregs[REG_RAX] = reinterpret_cast<greg_t>(&dataval);
+}
+
+void BM_FaultSignalFixup(benchmark::State& state) {
+  // Benchmarks SIGSEGV delivery; start by installing FixupHandler for it.
+  struct sigaction sa = {};
+  sigemptyset(&sa.sa_mask);
+  sa.sa_sigaction = FixupHandler;
+  sa.sa_flags = SA_SIGINFO;  // Use the three-argument sa_sigaction handler.
+  TEST_CHECK(sigaction(SIGSEGV, &sa, nullptr) == 0);
+
+  // Every iteration stores through a null pointer, raising one fault.
+  for (auto _ : state) {
+    register volatile unsigned int* ptr asm("rax");  // Kept in RAX.
+    // FixupHandler rewrites the saved RAX; presumably the store below uses RAX
+    // as its address, which is what lets it complete after the handler runs.
+    ptr = nullptr;
+    *ptr = 0;
+  }
+}
+
+BENCHMARK(BM_FaultSignalFixup)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/sleep_benchmark.cc b/test/perf/linux/sleep_benchmark.cc
new file mode 100644
index 000000000..99ef05117
--- /dev/null
+++ b/test/perf/linux/sleep_benchmark.cc
@@ -0,0 +1,60 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Sleeps for state.range(0) nanoseconds per iteration, issuing nanosleep(2)
+// directly through syscall(2).
+void BM_Sleep(benchmark::State& state) {
+  const int duration_ns = state.range(0);
+
+  for (auto _ : state) {
+    struct timespec remaining;
+    remaining.tv_sec = 0;
+    remaining.tv_nsec = duration_ns;
+
+    // The struct is both request and remainder, so a retry after EINTR only
+    // waits for the time still left. Any other errno fails the check.
+    while (syscall(SYS_nanosleep, &remaining, &remaining) < 0) {
+      TEST_CHECK(errno == EINTR);
+    }
+  }
+}
+
+// Sleep durations range from 0ns to 50ms.
+BENCHMARK(BM_Sleep)
+    ->Arg(0)
+    ->Arg(1)
+    ->Arg(1000)              // 1us
+    ->Arg(1000 * 1000)       // 1ms
+    ->Arg(10 * 1000 * 1000)  // 10ms
+    ->Arg(50 * 1000 * 1000)  // 50ms
+    ->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/stat_benchmark.cc b/test/perf/linux/stat_benchmark.cc
new file mode 100644
index 000000000..f15424482
--- /dev/null
+++ b/test/perf/linux/stat_benchmark.cc
@@ -0,0 +1,62 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Builds a chain of state.range(0) nested directories under a fresh temporary
+// directory, creates a file at the bottom, and times stat(2) on that file.
+void BM_Stat(benchmark::State& state) {
+  const int depth = state.range(0);
+  const TempPath top_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  std::string leaf_dir = top_dir.path();
+
+  // Levels are named depth-1, depth-2, ..., 0 from the top down. Plain Mkdir
+  // is used rather than TempPath because TempPath would make the paths too
+  // long to use; the top_dir destructor will clean up this whole tree.
+  for (int level = depth - 1; level >= 0; --level) {
+    leaf_dir = JoinPath(leaf_dir, absl::StrCat(level));
+    ASSERT_NO_ERRNO(Mkdir(leaf_dir, 0755));
+  }
+
+  // The file that every timed iteration stats.
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(leaf_dir));
+
+  struct stat info;
+  for (auto _ : state) {
+    ASSERT_THAT(stat(file.path().c_str(), &info), SyscallSucceeds());
+  }
+}
+
+// Directory depths from 1 to 100.
+BENCHMARK(BM_Stat)->Range(1, 100)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/unlink_benchmark.cc b/test/perf/linux/unlink_benchmark.cc
new file mode 100644
index 000000000..92243a042
--- /dev/null
+++ b/test/perf/linux/unlink_benchmark.cc
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Populates a fresh temporary directory with state.range(0) files (untimed)
+// and then times removing them one by one.
+void BM_Unlink(benchmark::State& state) {
+  const int file_count = state.range(0);
+
+  // Every pass unlinks file_count files; running it as a batch of that size
+  // makes the reported times per file.
+  TempPath dir;
+  while (state.KeepRunningBatch(file_count)) {
+    state.PauseTiming();
+    // N.B. dir lives outside the loop so that this assignment destroys the
+    // previous pass's directory while timing is paused.
+    dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+    std::vector<TempPath> pending;
+    for (int remaining = file_count; remaining > 0; --remaining) {
+      TempPath created =
+          ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+      pending.push_back(std::move(created));
+    }
+    state.ResumeTiming();
+
+    // Timed section: popping an entry runs its destructor, which unlinks.
+    while (!pending.empty()) {
+      pending.pop_back();
+    }
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_Unlink)->Range(1, 100 * 1000)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/perf/linux/write_benchmark.cc b/test/perf/linux/write_benchmark.cc
new file mode 100644
index 000000000..7b060c70e
--- /dev/null
+++ b/test/perf/linux/write_benchmark.cc
@@ -0,0 +1,52 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "benchmark/benchmark.h"
+#include "test/util/logging.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Repeatedly writes state.range(0) bytes at offset 0 of one temporary file.
+void BM_Write(benchmark::State& state) {
+  auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY));
+
+  const int size = state.range(0);
+  std::vector<char> payload(size);
+  RandomizeBuffer(payload.data(), size);
+
+  for (auto _ : state) {
+    TEST_CHECK(size == PwriteFd(fd.get(), payload.data(), size, 0));
+  }
+  const int64_t iterations = static_cast<int64_t>(state.iterations());
+  state.SetBytesProcessed(static_cast<int64_t>(size) * iterations);
+}
+
+BENCHMARK(BM_Write)->Range(1, 1 << 26)->UseRealTime();
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/runner/BUILD b/test/runner/BUILD
new file mode 100644
index 000000000..9959ef9b0
--- /dev/null
+++ b/test/runner/BUILD
@@ -0,0 +1,22 @@
+load("//tools:defs.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "runner",  # Default "runner" attribute of _runner_test in defs.bzl.
+    testonly = 1,
+    srcs = ["runner.go"],
+    data = [
+        "//runsc",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/log",
+        "//runsc/specutils",
+        "//runsc/testutil",
+        "//test/runner/gtest",  # Helpers for running google-test tests from Go.
+        "//test/uds",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl
new file mode 100644
index 000000000..5e97c1867
--- /dev/null
+++ b/test/runner/defs.bzl
@@ -0,0 +1,218 @@
+"""Defines a rule for syscall test targets."""
+
+load("//tools:defs.bzl", "loopback")
+
+def _runner_test_impl(ctx):
+    """Writes a bash wrapper that execs the runner with runner_args and the test."""
+    runner = ctx.actions.declare_file("%s-runner" % ctx.label.name)
+    runner_content = "\n".join([
+        "#!/bin/bash",
+        "set -euf -x -o pipefail",
+        # ":-" supplies an empty default so `set -u` cannot abort on an unset variable.
+        "if [[ -n \"${TEST_UNDECLARED_OUTPUTS_DIR:-}\" ]]; then",
+        "  mkdir -p \"${TEST_UNDECLARED_OUTPUTS_DIR}\"",
+        "  chmod a+rwx \"${TEST_UNDECLARED_OUTPUTS_DIR}\"",
+        "fi",
+        "exec %s %s %s\n" % (
+            ctx.files.runner[0].short_path,
+            " ".join(ctx.attr.runner_args),
+            ctx.files.test[0].short_path,
+        ),
+    ])
+    ctx.actions.write(runner, runner_content, is_executable = True)
+    # Return with all transitive files.
+    runfiles = ctx.runfiles(
+        transitive_files = depset(transitive = [
+            depset(target.data_runfiles.files)
+            for target in (ctx.attr.runner, ctx.attr.test)
+            if hasattr(target, "data_runfiles")
+        ]),
+        files = ctx.files.runner + ctx.files.test,
+        collect_default = True,
+        collect_data = True,
+    )
+    return [DefaultInfo(executable = runner, runfiles = runfiles)]
+
+_runner_test = rule(
+    attrs = {
+        "runner": attr.label(
+            default = "//test/runner:runner",  # Binary the wrapper script execs.
+        ),
+        "test": attr.label(
+            mandatory = True,  # Test binary; passed as the runner's last argument.
+        ),
+        "runner_args": attr.string_list(),  # Placed between runner and test path.
+        "data": attr.label_list(
+            allow_files = True,
+        ),
+    },
+    test = True,
+    implementation = _runner_test_impl,
+)
+
+def _syscall_test(
+        test,
+        shard_count,
+        size,
+        platform,
+        use_tmpfs,
+        tags,
+        network = "none",
+        file_access = "exclusive",
+        overlay = False,
+        add_uds_tree = False):
+    """Defines one _runner_test target for a single platform/configuration."""
+    # Prepend "runsc" to non-native platform names.
+    full_platform = platform if platform == "native" else "runsc_" + platform
+
+    # Name the test appropriately.
+    name = test.split(":")[1] + "_" + full_platform
+    if file_access == "shared":
+        name += "_shared"
+    if overlay:
+        name += "_overlay"
+    if network != "none":
+        name += "_" + network + "net"
+
+    # Apply all tags. Work on a copy: syscall_test hands the same list to every
+    # call, and the in-place additions below must not leak across targets.
+    tags = list(tags) if tags != None else []
+    # Add the full_platform and file access in a tag to make it easier to run
+    # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared.
+    tags += [full_platform, "file_" + file_access]
+
+    # Hash this target into one of 15 buckets. This can be used to
+    # randomly split targets between different workflows.
+    hash15 = hash(native.package_name() + name) % 15
+    tags.append("hash15:" + str(hash15))
+
+    # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until
+    # we figure out how to request ipv4 sockets on Guitar machines.
+    if network == "host":
+        tags.append("noguitar")
+
+    # Disable off-host networking.
+    tags.append("requires-net:loopback")
+
+    # Add tag to prevent the tests from running in a Bazel sandbox.
+    # TODO(b/120560048): Make the tests run without this tag.
+    tags.append("no-sandbox")
+
+    # TODO(b/112165693): KVM tests are tagged "manual" until the platform is
+    # more stable.
+    if platform == "kvm":
+        tags.append("manual")
+        tags.append("requires-kvm")
+
+        # TODO(b/112165693): Remove when tests pass reliably.
+        tags.append("notap")
+
+    runner_args = [
+        # Arguments are passed directly to runner binary.
+        "--platform=" + platform,
+        "--network=" + network,
+        "--use-tmpfs=" + str(use_tmpfs),
+        "--file-access=" + file_access,
+        "--overlay=" + str(overlay),
+        "--add-uds-tree=" + str(add_uds_tree),
+    ]
+
+    # Call the rule above.
+    _runner_test(
+        name = name,
+        test = test,
+        runner_args = runner_args,
+        data = [loopback],
+        size = size,
+        tags = tags,
+        shard_count = shard_count,
+    )
+
+def syscall_test(
+        test,
+        shard_count = 5,
+        size = "small",
+        use_tmpfs = False,
+        add_overlay = False,
+        add_uds_tree = False,
+        add_hostinet = False,
+        tags = None):
+    """Defines native, kvm and ptrace targets for a test, plus optional variants.
+
+    Args:
+      test: the test target.
+      shard_count: shard count given to every generated target.
+      size: test size given to every generated target.
+      use_tmpfs: use tmpfs in kvm/ptrace targets; if False, also add a shared file access target.
+      add_overlay: also add a ptrace target with overlay = True.
+      add_uds_tree: forwarded as add_uds_tree to every generated target.
+      add_hostinet: also add a ptrace target with network = "host".
+      tags: starting test tags; the same value is passed to every target.
+    """
+
+    _syscall_test(
+        test = test,
+        shard_count = shard_count,
+        size = size,
+        platform = "native",
+        use_tmpfs = False,
+        add_uds_tree = add_uds_tree,
+        tags = tags,
+    )
+
+    _syscall_test(
+        test = test,
+        shard_count = shard_count,
+        size = size,
+        platform = "kvm",
+        use_tmpfs = use_tmpfs,
+        add_uds_tree = add_uds_tree,
+        tags = tags,
+    )
+
+    _syscall_test(
+        test = test,
+        shard_count = shard_count,
+        size = size,
+        platform = "ptrace",
+        use_tmpfs = use_tmpfs,
+        add_uds_tree = add_uds_tree,
+        tags = tags,
+    )
+
+    if add_overlay:
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = "ptrace",
+            use_tmpfs = False,  # overlay is adding a writable tmpfs on top of root.
+            add_uds_tree = add_uds_tree,
+            tags = tags,
+            overlay = True,
+        )
+
+    if not use_tmpfs:
+        # Also test shared gofer access.
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = "ptrace",
+            use_tmpfs = use_tmpfs,
+            add_uds_tree = add_uds_tree,
+            tags = tags,
+            file_access = "shared",
+        )
+
+    if add_hostinet:
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = "ptrace",
+            use_tmpfs = use_tmpfs,
+            network = "host",
+            add_uds_tree = add_uds_tree,
+            tags = tags,
+        )
diff --git a/test/runner/gtest/BUILD b/test/runner/gtest/BUILD
new file mode 100644
index 000000000..de4b2727c
--- /dev/null
+++ b/test/runner/gtest/BUILD
@@ -0,0 +1,9 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "gtest",
+    srcs = ["gtest.go"],  # Package gtest: helpers for running google-test tests from Go.
+    visibility = ["//:sandbox"],
+)
diff --git a/test/runner/gtest/gtest.go b/test/runner/gtest/gtest.go
new file mode 100644
index 000000000..23bf7b5f6
--- /dev/null
+++ b/test/runner/gtest/gtest.go
@@ -0,0 +1,154 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gtest contains helpers for running google-test tests from Go.
+package gtest
+
+import (
+	"fmt"
+	"os/exec"
+	"strings"
+)
+
+var (
+	// listTestFlag is the flag that will list tests in gtest binaries.
+	listTestFlag = "--gtest_list_tests"
+
+	// filterTestFlag is the flag that will filter tests in gtest binaries.
+	filterTestFlag = "--gtest_filter"
+
+	// listBenchmarkFlag is the flag that will list benchmarks in gtest binaries.
+	listBenchmarkFlag = "--benchmark_list_tests"
+
+	// filterBenchmarkFlag is the flag that will run specified benchmarks.
+	filterBenchmarkFlag = "--benchmark_filter"
+)
+
+// TestCase is a single gtest test case or benchmark.
+type TestCase struct {
+	// Suite is the suite for this test.
+	Suite string
+
+	// Name is the name of this individual test.
+	Name string
+
+	// benchmark indicates that this is a benchmark. ParseTestCases sets
+	// Suite to "Benchmarks" for such entries, and Args selects the case
+	// through the benchmark filter flag instead of the test filter flag.
+	benchmark bool
+}
+
+// FullName joins the suite and the test name with a dot, giving the
+// "Suite.Name" form that can be passed to "-gtest_filter".
+func (tc TestCase) FullName() string {
+	return tc.Suite + "." + tc.Name
+}
+
+// Args returns the arguments that make the test binary run only this case.
+func (tc TestCase) Args() []string {
+	if tc.benchmark {
+		return []string{
+			fmt.Sprintf("%s=^$", filterTestFlag), // Names no test, so none run.
+			fmt.Sprintf("%s=^%s$", filterBenchmarkFlag, tc.Name),
+		}
+	}
+	return []string{
+		fmt.Sprintf("%s=%s", filterTestFlag, tc.FullName()), // Glob, not regexp.
+		fmt.Sprintf("%s=^$", filterBenchmarkFlag),
+	}
+}
+
+// ParseTestCases calls a gtest test binary to list its test and returns a
+// slice with the name and suite of each test.
+//
+// If benchmarks is true, then benchmarks will be included in the list of test
+// cases provided. Note that this requires the binary to support the
+// --benchmark_list_tests flag.
+func ParseTestCases(testBin string, benchmarks bool, extraArgs ...string) ([]TestCase, error) {
+	// Run to extract test cases.
+	args := append([]string{listTestFlag}, extraArgs...)
+	cmd := exec.Command(testBin, args...)
+	out, err := cmd.Output()
+	if err != nil {
+		exitErr, ok := err.(*exec.ExitError)
+		if !ok {
+			return nil, fmt.Errorf("could not enumerate gtest tests: %v", err)
+		}
+		return nil, fmt.Errorf("could not enumerate gtest tests: %v\nstderr:\n%s", err, exitErr.Stderr)
+	}
+
+	// Parse test output.
+	var t []TestCase
+	var suite string
+	for _, line := range strings.Split(string(out), "\n") {
+		// Strip comments.
+		line = strings.Split(line, "#")[0]
+
+		// New suite?
+		if !strings.HasPrefix(line, " ") {
+			suite = strings.TrimSuffix(strings.TrimSpace(line), ".")
+			continue
+		}
+
+		// Individual test.
+		name := strings.TrimSpace(line)
+
+		// Do we have a suite yet?
+		if suite == "" {
+			return nil, fmt.Errorf("test without a suite: %v", name)
+		}
+
+		// Add this individual test.
+		t = append(t, TestCase{
+			Suite: suite,
+			Name:  name,
+		})
+
+	}
+
+	// Finished?
+	if !benchmarks {
+		return t, nil
+	}
+
+	// Run again to extract benchmarks.
+	args = append([]string{listBenchmarkFlag}, extraArgs...)
+	cmd = exec.Command(testBin, args...)
+	out, err = cmd.Output()
+	if err != nil {
+		exitErr, ok := err.(*exec.ExitError)
+		if !ok {
+			return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v", err)
+		}
+		return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v\nstderr\n%s", err, exitErr.Stderr)
+	}
+
+	// Parse benchmark output.
+	for _, line := range strings.Split(string(out), "\n") {
+		// Strip comments; skip blank lines such as the one after the final newline.
+		name := strings.TrimSpace(strings.Split(line, "#")[0])
+		if name == "" {
+			continue
+		}
+
+		// Add the single benchmark.
+		t = append(t, TestCase{
+			Suite:     "Benchmarks",
+			Name:      name,
+			benchmark: true,
+		})
+	}
+
+	return t, nil
+}
diff --git a/test/runner/runner.go b/test/runner/runner.go
new file mode 100644
index 000000000..a78ef38e0
--- /dev/null
+++ b/test/runner/runner.go
@@ -0,0 +1,477 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary syscall_test_runner runs the syscall test suites in gVisor
+// containers and on the host platform.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"os/signal"
+	"path/filepath"
+	"strings"
+	"syscall"
+	"testing"
+	"time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/runsc/specutils"
+	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/test/runner/gtest"
+	"gvisor.dev/gvisor/test/uds"
+)
+
+var (
+	debug      = flag.Bool("debug", false, "enable debug logs")
+	strace     = flag.Bool("strace", false, "enable strace logs")
+	platform   = flag.String("platform", "ptrace", "platform to run on")
+	network    = flag.String("network", "none", "network stack to run on (sandbox, host, none)")
+	useTmpfs   = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp")
+	fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode")
+	overlay    = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
+	parallel   = flag.Bool("parallel", false, "run tests in parallel")
+	runscPath  = flag.String("runsc", "", "path to runsc binary")
+	// addUDSTree is consulted by both the native and the runsc code paths.
+	addUDSTree = flag.Bool("add-uds-tree", false, "expose a tree of UDS utilities for use in tests")
+)
+
+// runTestCaseNative runs the test case directly on the host machine.
+func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) {
+	// Tests may run in parallel, so give each one a unique temp dir.
+	tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
+	if err != nil {
+		t.Fatalf("could not create temp dir: %v", err)
+	}
+	defer os.RemoveAll(tmpDir)
+
+	// Point TEST_TMPDIR at the unique dir, adding the variable if absent.
+	env := os.Environ()
+	newEnvVar := "TEST_TMPDIR=" + tmpDir
+	var found bool
+	for i, kv := range env {
+		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
+			env[i] = newEnvVar
+			found = true
+			break
+		}
+	}
+	if !found {
+		env = append(env, newEnvVar)
+	}
+	// Remove env variables that cause the gunit binary to write output files,
+	// since they will stomp on each other and on this go test's output files.
+	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
+
+	// Remove shard variables so the gunit binary does not interpret them.
+	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
+
+	if *addUDSTree {
+		socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
+		if err != nil {
+			t.Fatalf("failed to create socket tree: %v", err)
+		}
+		defer cleanup()
+
+		env = append(env, "TEST_UDS_TREE="+socketDir)
+		// Linux has no separate "attach" location; reuse the same path.
+		env = append(env, "TEST_UDS_ATTACH_TREE="+socketDir)
+	}
+
+	cmd := exec.Command(testBin, tc.Args()...)
+	cmd.Env = env
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		// A non-ExitError (e.g. the binary failed to start) has no wait status.
+		exitErr, ok := err.(*exec.ExitError)
+		if !ok {
+			t.Fatalf("test %q could not be run: %v", tc.FullName(), err)
+		}
+		ws := exitErr.Sys().(syscall.WaitStatus)
+		t.Errorf("test %q exited with status %d, want 0", tc.FullName(), ws.ExitStatus())
+	}
+}
+
+// runRunsc runs spec in runsc in a standard test configuration.
+//
+// runsc logs are saved under TEST_UNDECLARED_OUTPUTS_DIR when it is set. On
+// SIGTERM, "runsc debug" dumps stacks and then signals the sandbox. An error
+// is returned if setup fails or if the "runsc run" command fails.
+func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
+	bundleDir, err := testutil.SetupBundleDir(spec)
+	if err != nil {
+		return fmt.Errorf("SetupBundleDir failed: %v", err)
+	}
+	defer os.RemoveAll(bundleDir)
+
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		return fmt.Errorf("SetupRootDir failed: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	name := tc.FullName()
+	id := testutil.UniqueContainerID()
+	log.Infof("Running test %q in container %q", name, id)
+	specutils.LogSpec(spec)
+
+	args := []string{
+		"-root", rootDir,
+		"-network", *network,
+		"-log-format=text",
+		"-TESTONLY-unsafe-nonroot=true",
+		"-net-raw=true",
+		fmt.Sprintf("-panic-signal=%d", syscall.SIGTERM),
+		"-watchdog-action=panic",
+		"-platform", *platform,
+		"-file-access", *fileAccess,
+	}
+	if *overlay {
+		args = append(args, "-overlay")
+	}
+	if *debug {
+		args = append(args, "-debug", "-log-packets=true")
+	}
+	if *strace {
+		args = append(args, "-strace")
+	}
+	if *addUDSTree {
+		args = append(args, "-fsgofer-host-uds")
+	}
+
+	if outDir, ok := syscall.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
+		tdir := filepath.Join(outDir, strings.Replace(name, "/", "_", -1))
+		if err := os.MkdirAll(tdir, 0755); err != nil {
+			return fmt.Errorf("could not create test dir: %v", err)
+		}
+		debugLogDir, err := ioutil.TempDir(tdir, "runsc")
+		if err != nil {
+			return fmt.Errorf("could not create temp dir: %v", err)
+		}
+		debugLogDir += "/"
+		log.Infof("runsc logs: %s", debugLogDir)
+		args = append(args, "-debug-log", debugLogDir)
+
+		// Default -log sends messages to stderr which makes reading the test log
+		// difficult. Instead, drop them when debug log is enabled given it's a
+		// better place for these messages.
+		args = append(args, "-log=/dev/null")
+	}
+
+	// Current process doesn't have CAP_SYS_ADMIN, create user namespace and run
+	// as root inside that namespace to get it.
+	rArgs := append(args, "run", "--bundle", bundleDir, id)
+	cmd := exec.Command(*runscPath, rArgs...)
+	cmd.SysProcAttr = &syscall.SysProcAttr{
+		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
+		// Set current user/group as root inside the namespace.
+		UidMappings: []syscall.SysProcIDMap{
+			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
+		},
+		GidMappings: []syscall.SysProcIDMap{
+			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
+		},
+		GidMappingsEnableSetgroups: false,
+		Credential: &syscall.Credential{
+			Uid: 0,
+			Gid: 0,
+		},
+	}
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	sig := make(chan os.Signal, 1)
+	signal.Notify(sig, syscall.SIGTERM)
+	go func() {
+		s, ok := <-sig
+		if !ok {
+			return
+		}
+		log.Warningf("%s: Got signal: %v", name, s)
+		done := make(chan bool)
+		dArgs := append([]string{}, args...)
+		dArgs = append(dArgs, "-alsologtostderr=true", "debug", "--stacks", id)
+		go func(dArgs []string) {
+			cmd := exec.Command(*runscPath, dArgs...)
+			cmd.Stdout = os.Stdout
+			cmd.Stderr = os.Stderr
+			cmd.Run()
+			done <- true
+		}(dArgs)
+
+		timeout := time.After(3 * time.Second)
+		select {
+		case <-timeout:
+			log.Infof("runsc debug --stacks is timeouted")
+		case <-done:
+		}
+
+		log.Warningf("Send SIGTERM to the sandbox process")
+		dArgs = append(args, "debug",
+			fmt.Sprintf("--signal=%d", syscall.SIGTERM),
+			id)
+		cmd := exec.Command(*runscPath, dArgs...)
+		cmd.Stdout = os.Stdout
+		cmd.Stderr = os.Stderr
+		cmd.Run()
+	}()
+
+	err = cmd.Run()
+	// Stop delivery first; closing sig then lets an idle handler goroutine exit.
+	signal.Stop(sig)
+	close(sig)
+
+	return err
+}
+
+// setupUDSTree calls uds.CreateSocketTree and updates spec with the bind
+// mounts and environment variables that expose the resulting tree to the
+// test. The returned cleanup function is the one CreateSocketTree provided.
+func setupUDSTree(spec *specs.Spec) (cleanup func(), err error) {
+	socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
+	if err != nil {
+		return nil, fmt.Errorf("failed to create socket tree: %v", err)
+	}
+
+	const (
+		treeDst   = "/tmp/sockets"
+		attachDst = "/tmp/sockets-attach"
+	)
+
+	// The whole tree is reachable through a single bind mount.
+	mounts := []specs.Mount{{
+		Destination: treeDst,
+		Source:      socketDir,
+		Type:        "bind",
+	}}
+
+	// Every socket additionally gets its own attach point, to test
+	// mounts that attach directly to the sockets.
+	for _, sock := range []string{
+		"stream/echo",
+		"stream/nonlistening",
+		"seqpacket/echo",
+		"seqpacket/nonlistening",
+		"dgram/null",
+	} {
+		mounts = append(mounts, specs.Mount{
+			Destination: attachDst + "/" + sock,
+			Source:      filepath.Join(socketDir, sock),
+			Type:        "bind",
+		})
+	}
+	spec.Mounts = append(spec.Mounts, mounts...)
+
+	// Tell the test where both views of the tree are mounted.
+	spec.Process.Env = append(
+		spec.Process.Env,
+		"TEST_UDS_TREE="+treeDst,
+		"TEST_UDS_ATTACH_TREE="+attachDst,
+	)
+
+	return cleanup, nil
+}
+
+// runTestCaseRunsc runs the test case in runsc.
+func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
+	// Run a new container with the test executable and filter for the
+	// given test suite and name.
+	spec := testutil.NewSpecWithArgs(append([]string{testBin}, tc.Args()...)...)
+
+	// Mark the root as writeable, as some tests attempt to
+	// write to the rootfs, and expect EACCES, not EROFS.
+	spec.Root.Readonly = false
+
+	// Test spec comes with pre-defined mounts that we don't want. Reset it.
+	spec.Mounts = nil
+	if *useTmpfs {
+		// Forces '/tmp' to be mounted as tmpfs, otherwise tests that rely on
+		// features only available in gVisor's internal tmpfs may fail.
+		spec.Mounts = append(spec.Mounts, specs.Mount{
+			Destination: "/tmp",
+			Type:        "tmpfs",
+		})
+	} else {
+		// Use a gofer-backed directory as '/tmp'.
+		//
+		// Tests might be running in parallel, so make sure each has a
+		// unique test temp dir. Some tests (e.g., sticky) access this mount
+		// from other users, so make sure it is world-accessible.
+		tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
+		if err != nil {
+			t.Fatalf("could not create temp dir: %v", err)
+		}
+		defer os.RemoveAll(tmpDir)
+
+		if err := os.Chmod(tmpDir, 0777); err != nil {
+			t.Fatalf("could not chmod temp dir: %v", err)
+		}
+
+		spec.Mounts = append(spec.Mounts, specs.Mount{
+			Type:        "bind",
+			Destination: "/tmp",
+			Source:      tmpDir,
+		})
+	}
+
+	// Set environment variables that indicate we are
+	// running in gVisor with the given platform and network.
+	env := append(os.Environ(), "TEST_ON_GVISOR="+*platform, "GVISOR_NETWORK="+*network)
+
+	// Remove env variables that cause the gunit binary to write output files,
+	// since they will stomp on each other and on this go test's output files.
+	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
+
+	// Remove shard variables so the gunit binary does not interpret them.
+	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
+
+	// Set TEST_TMPDIR to /tmp, as some of the syscall tests require it to
+	// be backed by tmpfs. Append it when the variable was not inherited.
+	const tmpDirVar = "TEST_TMPDIR=/tmp"
+	var found bool
+	for i, kv := range env {
+		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
+			env[i] = tmpDirVar
+			found = true
+			break
+		}
+	}
+	if !found {
+		env = append(env, tmpDirVar)
+	}
+
+	spec.Process.Env = env
+
+	if *addUDSTree {
+		cleanup, err := setupUDSTree(spec)
+		if err != nil {
+			t.Fatalf("error creating UDS tree: %v", err)
+		}
+		defer cleanup()
+	}
+
+	if err := runRunsc(tc, spec); err != nil {
+		t.Errorf("test %q failed with error %v, want nil", tc.FullName(), err)
+	}
+}
+
+// filterEnv returns the entries of env whose variable is not in blacklist.
+//
+// An entry is dropped when it starts with "<name>=" for any blacklisted name;
+// the relative order of the remaining entries is preserved.
+func filterEnv(env, blacklist []string) []string {
+	var kept []string
+nextEntry:
+	for _, entry := range env {
+		for _, name := range blacklist {
+			if strings.HasPrefix(entry, name+"=") {
+				continue nextEntry
+			}
+		}
+		kept = append(kept, entry)
+	}
+	return kept
+}
+
+func fatalf(format string, args ...interface{}) {
+	os.Stderr.WriteString(fmt.Sprintf(format+"\n", args...)) // Message plus newline.
+	os.Exit(1)                                                // Does not return.
+}
+
+func matchString(pat, str string) (bool, error) {
+	return pat == str, nil // Exact comparison only; the error is always nil.
+}
+
+// main lists the cases in the test binary and runs this shard's share of them.
+func main() {
+	flag.Parse()
+	if flag.NArg() != 1 {
+		fatalf("test must be provided")
+	}
+	testBin := flag.Args()[0] // Only argument.
+
+	log.SetLevel(log.Info)
+	if *debug {
+		log.SetLevel(log.Debug)
+	}
+
+	if *platform != "native" && *runscPath == "" {
+		if err := testutil.ConfigureExePath(); err != nil {
+			fatalf("ConfigureExePath() failed: %v", err)
+		}
+		*runscPath = specutils.ExePath
+	}
+
+	// Make sure stdout and stderr are opened with O_APPEND, otherwise logs from
+	// outside the sandbox can (and will) stomp on logs from inside the sandbox.
+	for _, f := range []*os.File{os.Stdout, os.Stderr} {
+		flags, err := unix.FcntlInt(f.Fd(), unix.F_GETFL, 0)
+		if err != nil {
+			fatalf("error getting file flags for %v: %v", f, err)
+		}
+		if flags&unix.O_APPEND == 0 {
+			flags |= unix.O_APPEND
+			if _, err := unix.FcntlInt(f.Fd(), unix.F_SETFL, flags); err != nil {
+				fatalf("error setting file flags for %v: %v", f, err)
+			}
+		}
+	}
+
+	// Resolve the absolute path first so listing and running use the same file.
+	testBin, err := filepath.Abs(testBin)
+	if err != nil {
+		fatalf("Abs() failed: %v", err)
+	}
+
+	// Get all test cases in each binary.
+	testCases, err := gtest.ParseTestCases(testBin, true)
+	if err != nil {
+		fatalf("ParseTestCases(%q) failed: %v", testBin, err)
+	}
+
+	// Get subset of tests corresponding to shard.
+	indices, err := testutil.TestIndicesForShard(len(testCases))
+	if err != nil {
+		fatalf("TestIndicesForShard() failed: %v", err)
+	}
+
+	// Run the tests.
+	var tests []testing.InternalTest
+	for _, tci := range indices {
+		// Capture tc.
+		tc := testCases[tci]
+		tests = append(tests, testing.InternalTest{
+			Name: fmt.Sprintf("%s_%s", tc.Suite, tc.Name),
+			F: func(t *testing.T) {
+				if *parallel {
+					t.Parallel()
+				}
+				if *platform == "native" {
+					// Run the test case on host.
+					runTestCaseNative(testBin, tc, t)
+				} else {
+					// Run the test case in runsc.
+					runTestCaseRunsc(testBin, tc, t)
+				}
+			},
+		})
+	}
+
+	testing.Main(matchString, tests, nil, nil)
+}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 31d239c0e..d69ac8356 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -1,5 +1,4 @@
-load("//tools:defs.bzl", "go_binary")
-load("//test/syscalls:build_defs.bzl", "syscall_test")
+load("//test/runner:defs.bzl", "syscall_test")
 
 package(licenses = ["notice"])
 
@@ -726,21 +725,3 @@ syscall_test(test = "//test/syscalls/linux:proc_net_unix_test")
 syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test")
 
 syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
-
-go_binary(
-    name = "syscall_test_runner",
-    testonly = 1,
-    srcs = ["syscall_test_runner.go"],
-    data = [
-        "//runsc",
-    ],
-    deps = [
-        "//pkg/log",
-        "//runsc/specutils",
-        "//runsc/testutil",
-        "//test/syscalls/gtest",
-        "//test/uds",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
-        "@org_golang_x_sys//unix:go_default_library",
-    ],
-)
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
deleted file mode 100644
index cbab85ef7..000000000
--- a/test/syscalls/build_defs.bzl
+++ /dev/null
@@ -1,180 +0,0 @@
-"""Defines a rule for syscall test targets."""
-
-load("//tools:defs.bzl", "loopback")
-
-def syscall_test(
-        test,
-        shard_count = 5,
-        size = "small",
-        use_tmpfs = False,
-        add_overlay = False,
-        add_uds_tree = False,
-        add_hostinet = False,
-        tags = None):
-    """syscall_test is a macro that will create targets for all platforms.
-
-    Args:
-      test: the test target.
-      shard_count: shards for defined tests.
-      size: the defined test size.
-      use_tmpfs: use tmpfs in the defined tests.
-      add_overlay: add an overlay test.
-      add_uds_tree: add a UDS test.
-      add_hostinet: add a hostinet test.
-      tags: starting test tags.
-    """
-
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "native",
-        use_tmpfs = False,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
-
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "kvm",
-        use_tmpfs = use_tmpfs,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
-
-    _syscall_test(
-        test = test,
-        shard_count = shard_count,
-        size = size,
-        platform = "ptrace",
-        use_tmpfs = use_tmpfs,
-        add_uds_tree = add_uds_tree,
-        tags = tags,
-    )
-
-    if add_overlay:
-        _syscall_test(
-            test = test,
-            shard_count = shard_count,
-            size = size,
-            platform = "ptrace",
-            use_tmpfs = False,  # overlay is adding a writable tmpfs on top of root.
-            add_uds_tree = add_uds_tree,
-            tags = tags,
-            overlay = True,
-        )
-
-    if not use_tmpfs:
-        # Also test shared gofer access.
-        _syscall_test(
-            test = test,
-            shard_count = shard_count,
-            size = size,
-            platform = "ptrace",
-            use_tmpfs = use_tmpfs,
-            add_uds_tree = add_uds_tree,
-            tags = tags,
-            file_access = "shared",
-        )
-
-    if add_hostinet:
-        _syscall_test(
-            test = test,
-            shard_count = shard_count,
-            size = size,
-            platform = "ptrace",
-            use_tmpfs = use_tmpfs,
-            network = "host",
-            add_uds_tree = add_uds_tree,
-            tags = tags,
-        )
-
-def _syscall_test(
-        test,
-        shard_count,
-        size,
-        platform,
-        use_tmpfs,
-        tags,
-        network = "none",
-        file_access = "exclusive",
-        overlay = False,
-        add_uds_tree = False):
-    test_name = test.split(":")[1]
-
-    # Prepend "runsc" to non-native platform names.
-    full_platform = platform if platform == "native" else "runsc_" + platform
-
-    name = test_name + "_" + full_platform
-    if file_access == "shared":
-        name += "_shared"
-    if overlay:
-        name += "_overlay"
-    if network != "none":
-        name += "_" + network + "net"
-
-    if tags == None:
-        tags = []
-
-    # Add the full_platform and file access in a tag to make it easier to run
-    # all the tests on a specific flavor. Use --test_tag_filters=ptrace,file_shared.
-    tags += [full_platform, "file_" + file_access]
-
-    # Hash this target into one of 15 buckets. This can be used to
-    # randomly split targets between different workflows.
-    hash15 = hash(native.package_name() + name) % 15
-    tags.append("hash15:" + str(hash15))
-
-    # TODO(b/139838000): Tests using hostinet must be disabled on Guitar until
-    # we figure out how to request ipv4 sockets on Guitar machines.
-    if network == "host":
-        tags.append("noguitar")
-
-    # Disable off-host networking.
-    tags.append("requires-net:loopback")
-
-    # Add tag to prevent the tests from running in a Bazel sandbox.
-    # TODO(b/120560048): Make the tests run without this tag.
-    tags.append("no-sandbox")
-
-    # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is
-    # more stable.
-    if platform == "kvm":
-        tags.append("manual")
-        tags.append("requires-kvm")
-
-        # TODO(b/112165693): Remove when tests pass reliably.
-        tags.append("notap")
-
-    args = [
-        # Arguments are passed directly to syscall_test_runner binary.
-        "--test-name=" + test_name,
-        "--platform=" + platform,
-        "--network=" + network,
-        "--use-tmpfs=" + str(use_tmpfs),
-        "--file-access=" + file_access,
-        "--overlay=" + str(overlay),
-        "--add-uds-tree=" + str(add_uds_tree),
-    ]
-
-    sh_test(
-        srcs = ["syscall_test_runner.sh"],
-        name = name,
-        data = [
-            ":syscall_test_runner",
-            loopback,
-            test,
-        ],
-        args = args,
-        size = size,
-        tags = tags,
-        shard_count = shard_count,
-    )
-
-def sh_test(**kwargs):
-    """Wraps the standard sh_test."""
-    native.sh_test(
-        **kwargs
-    )
diff --git a/test/syscalls/gtest/BUILD b/test/syscalls/gtest/BUILD
deleted file mode 100644
index de4b2727c..000000000
--- a/test/syscalls/gtest/BUILD
+++ /dev/null
@@ -1,9 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "gtest",
-    srcs = ["gtest.go"],
-    visibility = ["//:sandbox"],
-)
diff --git a/test/syscalls/gtest/gtest.go b/test/syscalls/gtest/gtest.go
deleted file mode 100644
index bdec8eb07..000000000
--- a/test/syscalls/gtest/gtest.go
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package gtest contains helpers for running google-test tests from Go.
-package gtest
-
-import (
-	"fmt"
-	"os/exec"
-	"strings"
-)
-
-var (
-	// ListTestFlag is the flag that will list tests in gtest binaries.
-	ListTestFlag = "--gtest_list_tests"
-
-	// FilterTestFlag is the flag that will filter tests in gtest binaries.
-	FilterTestFlag = "--gtest_filter"
-)
-
-// TestCase is a single gtest test case.
-type TestCase struct {
-	// Suite is the suite for this test.
-	Suite string
-
-	// Name is the name of this individual test.
-	Name string
-}
-
-// FullName returns the name of the test including the suite. It is suitable to
-// pass to "-gtest_filter".
-func (tc TestCase) FullName() string {
-	return fmt.Sprintf("%s.%s", tc.Suite, tc.Name)
-}
-
-// ParseTestCases calls a gtest test binary to list its test and returns a
-// slice with the name and suite of each test.
-func ParseTestCases(testBin string, extraArgs ...string) ([]TestCase, error) {
-	args := append([]string{ListTestFlag}, extraArgs...)
-	cmd := exec.Command(testBin, args...)
-	out, err := cmd.Output()
-	if err != nil {
-		exitErr, ok := err.(*exec.ExitError)
-		if !ok {
-			return nil, fmt.Errorf("could not enumerate gtest tests: %v", err)
-		}
-		return nil, fmt.Errorf("could not enumerate gtest tests: %v\nstderr:\n%s", err, exitErr.Stderr)
-	}
-
-	var t []TestCase
-	var suite string
-	for _, line := range strings.Split(string(out), "\n") {
-		// Strip comments.
-		line = strings.Split(line, "#")[0]
-
-		// New suite?
-		if !strings.HasPrefix(line, " ") {
-			suite = strings.TrimSuffix(strings.TrimSpace(line), ".")
-			continue
-		}
-
-		// Individual test.
-		name := strings.TrimSpace(line)
-
-		// Do we have a suite yet?
-		if suite == "" {
-			return nil, fmt.Errorf("test without a suite: %v", name)
-		}
-
-		// Add this individual test.
-		t = append(t, TestCase{
-			Suite: suite,
-			Name:  name,
-		})
-
-	}
-
-	if len(t) == 0 {
-		return nil, fmt.Errorf("no tests parsed from %v", testBin)
-	}
-	return t, nil
-}
diff --git a/test/syscalls/linux/alarm.cc b/test/syscalls/linux/alarm.cc
index d89269985..940c97285 100644
--- a/test/syscalls/linux/alarm.cc
+++ b/test/syscalls/linux/alarm.cc
@@ -188,6 +188,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index b5e0a512b..07bd527e6 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -868,6 +868,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 421c15b87..c7cc5816e 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -1128,5 +1128,5 @@ int main(int argc, char** argv) {
     exit(err);
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc
index b77e4cbd1..8b48f0804 100644
--- a/test/syscalls/linux/itimer.cc
+++ b/test/syscalls/linux/itimer.cc
@@ -349,6 +349,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/prctl.cc b/test/syscalls/linux/prctl.cc
index d07571a5f..04c5161f5 100644
--- a/test/syscalls/linux/prctl.cc
+++ b/test/syscalls/linux/prctl.cc
@@ -226,5 +226,5 @@ int main(int argc, char** argv) {
          prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/prctl_setuid.cc b/test/syscalls/linux/prctl_setuid.cc
index 30f0d75b3..c4e9cf528 100644
--- a/test/syscalls/linux/prctl_setuid.cc
+++ b/test/syscalls/linux/prctl_setuid.cc
@@ -264,5 +264,5 @@ int main(int argc, char** argv) {
            prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0);
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index a23fdb58d..f91187e75 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -2076,5 +2076,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index 4dd5cf27b..bfe3e2603 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -1208,5 +1208,5 @@ int main(int argc, char** argv) {
     gvisor::testing::RunExecveChild();
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/rtsignal.cc b/test/syscalls/linux/rtsignal.cc
index 81d193ffd..ed27e2566 100644
--- a/test/syscalls/linux/rtsignal.cc
+++ b/test/syscalls/linux/rtsignal.cc
@@ -167,6 +167,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 2c947feb7..cf6499f8b 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -411,5 +411,5 @@ int main(int argc, char** argv) {
   }
 
   gvisor::testing::TestInit(&argc, &argv);
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/sigiret.cc b/test/syscalls/linux/sigiret.cc
index 4deb1ae95..6227774a4 100644
--- a/test/syscalls/linux/sigiret.cc
+++ b/test/syscalls/linux/sigiret.cc
@@ -132,6 +132,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/signalfd.cc b/test/syscalls/linux/signalfd.cc
index 95be4b66c..389e5fca2 100644
--- a/test/syscalls/linux/signalfd.cc
+++ b/test/syscalls/linux/signalfd.cc
@@ -369,5 +369,5 @@ int main(int argc, char** argv) {
 
   gvisor::testing::TestInit(&argc, &argv);
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/sigstop.cc b/test/syscalls/linux/sigstop.cc
index 7db57d968..b2fcedd62 100644
--- a/test/syscalls/linux/sigstop.cc
+++ b/test/syscalls/linux/sigstop.cc
@@ -147,5 +147,5 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/sigtimedwait.cc b/test/syscalls/linux/sigtimedwait.cc
index 1e5bf5942..4f8afff15 100644
--- a/test/syscalls/linux/sigtimedwait.cc
+++ b/test/syscalls/linux/sigtimedwait.cc
@@ -319,6 +319,5 @@ int main(int argc, char** argv) {
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
   gvisor::testing::TestInit(&argc, &argv);
-
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc
index 2f92c27da..4b3c44527 100644
--- a/test/syscalls/linux/timers.cc
+++ b/test/syscalls/linux/timers.cc
@@ -658,5 +658,5 @@ int main(int argc, char** argv) {
     }
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/linux/vfork.cc b/test/syscalls/linux/vfork.cc
index 0aaba482d..19d05998e 100644
--- a/test/syscalls/linux/vfork.cc
+++ b/test/syscalls/linux/vfork.cc
@@ -191,5 +191,5 @@ int main(int argc, char** argv) {
     return gvisor::testing::RunChild();
   }
 
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
deleted file mode 100644
index ae342b68c..000000000
--- a/test/syscalls/syscall_test_runner.go
+++ /dev/null
@@ -1,482 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Binary syscall_test_runner runs the syscall test suites in gVisor
-// containers and on the host platform.
-package main
-
-import (
-	"flag"
-	"fmt"
-	"io/ioutil"
-	"os"
-	"os/exec"
-	"os/signal"
-	"path/filepath"
-	"strings"
-	"syscall"
-	"testing"
-	"time"
-
-	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/specutils"
-	"gvisor.dev/gvisor/runsc/testutil"
-	"gvisor.dev/gvisor/test/syscalls/gtest"
-	"gvisor.dev/gvisor/test/uds"
-)
-
-// Location of syscall tests, relative to the repo root.
-const testDir = "test/syscalls/linux"
-
-var (
-	testName   = flag.String("test-name", "", "name of test binary to run")
-	debug      = flag.Bool("debug", false, "enable debug logs")
-	strace     = flag.Bool("strace", false, "enable strace logs")
-	platform   = flag.String("platform", "ptrace", "platform to run on")
-	network    = flag.String("network", "none", "network stack to run on (sandbox, host, none)")
-	useTmpfs   = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp")
-	fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode")
-	overlay    = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
-	parallel   = flag.Bool("parallel", false, "run tests in parallel")
-	runscPath  = flag.String("runsc", "", "path to runsc binary")
-
-	addUDSTree = flag.Bool("add-uds-tree", false, "expose a tree of UDS utilities for use in tests")
-)
-
-// runTestCaseNative runs the test case directly on the host machine.
-func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) {
-	// These tests might be running in parallel, so make sure they have a
-	// unique test temp dir.
-	tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
-	if err != nil {
-		t.Fatalf("could not create temp dir: %v", err)
-	}
-	defer os.RemoveAll(tmpDir)
-
-	// Replace TEST_TMPDIR in the current environment with something
-	// unique.
-	env := os.Environ()
-	newEnvVar := "TEST_TMPDIR=" + tmpDir
-	var found bool
-	for i, kv := range env {
-		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
-			env[i] = newEnvVar
-			found = true
-			break
-		}
-	}
-	if !found {
-		env = append(env, newEnvVar)
-	}
-	// Remove env variables that cause the gunit binary to write output
-	// files, since they will stomp on eachother, and on the output files
-	// from this go test.
-	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
-
-	// Remove shard env variables so that the gunit binary does not try to
-	// intepret them.
-	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
-
-	if *addUDSTree {
-		socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
-		if err != nil {
-			t.Fatalf("failed to create socket tree: %v", err)
-		}
-		defer cleanup()
-
-		env = append(env, "TEST_UDS_TREE="+socketDir)
-		// On Linux, the concept of "attach" location doesn't exist.
-		// Just pass the same path to make these test identical.
-		env = append(env, "TEST_UDS_ATTACH_TREE="+socketDir)
-	}
-
-	cmd := exec.Command(testBin, gtest.FilterTestFlag+"="+tc.FullName())
-	cmd.Env = env
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-	if err := cmd.Run(); err != nil {
-		ws := err.(*exec.ExitError).Sys().(syscall.WaitStatus)
-		t.Errorf("test %q exited with status %d, want 0", tc.FullName(), ws.ExitStatus())
-	}
-}
-
-// runRunsc runs spec in runsc in a standard test configuration.
-//
-// runsc logs will be saved to a path in TEST_UNDECLARED_OUTPUTS_DIR.
-//
-// Returns an error if the sandboxed application exits non-zero.
-func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
-	bundleDir, err := testutil.SetupBundleDir(spec)
-	if err != nil {
-		return fmt.Errorf("SetupBundleDir failed: %v", err)
-	}
-	defer os.RemoveAll(bundleDir)
-
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		return fmt.Errorf("SetupRootDir failed: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
-	name := tc.FullName()
-	id := testutil.UniqueContainerID()
-	log.Infof("Running test %q in container %q", name, id)
-	specutils.LogSpec(spec)
-
-	args := []string{
-		"-root", rootDir,
-		"-network", *network,
-		"-log-format=text",
-		"-TESTONLY-unsafe-nonroot=true",
-		"-net-raw=true",
-		fmt.Sprintf("-panic-signal=%d", syscall.SIGTERM),
-		"-watchdog-action=panic",
-		"-platform", *platform,
-		"-file-access", *fileAccess,
-	}
-	if *overlay {
-		args = append(args, "-overlay")
-	}
-	if *debug {
-		args = append(args, "-debug", "-log-packets=true")
-	}
-	if *strace {
-		args = append(args, "-strace")
-	}
-	if *addUDSTree {
-		args = append(args, "-fsgofer-host-uds")
-	}
-
-	if outDir, ok := syscall.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
-		tdir := filepath.Join(outDir, strings.Replace(name, "/", "_", -1))
-		if err := os.MkdirAll(tdir, 0755); err != nil {
-			return fmt.Errorf("could not create test dir: %v", err)
-		}
-		debugLogDir, err := ioutil.TempDir(tdir, "runsc")
-		if err != nil {
-			return fmt.Errorf("could not create temp dir: %v", err)
-		}
-		debugLogDir += "/"
-		log.Infof("runsc logs: %s", debugLogDir)
-		args = append(args, "-debug-log", debugLogDir)
-
-		// Default -log sends messages to stderr which makes reading the test log
-		// difficult. Instead, drop them when debug log is enabled given it's a
-		// better place for these messages.
-		args = append(args, "-log=/dev/null")
-	}
-
-	// Current process doesn't have CAP_SYS_ADMIN, create user namespace and run
-	// as root inside that namespace to get it.
-	rArgs := append(args, "run", "--bundle", bundleDir, id)
-	cmd := exec.Command(*runscPath, rArgs...)
-	cmd.SysProcAttr = &syscall.SysProcAttr{
-		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
-		// Set current user/group as root inside the namespace.
-		UidMappings: []syscall.SysProcIDMap{
-			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
-		},
-		GidMappings: []syscall.SysProcIDMap{
-			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
-		},
-		GidMappingsEnableSetgroups: false,
-		Credential: &syscall.Credential{
-			Uid: 0,
-			Gid: 0,
-		},
-	}
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-	sig := make(chan os.Signal, 1)
-	signal.Notify(sig, syscall.SIGTERM)
-	go func() {
-		s, ok := <-sig
-		if !ok {
-			return
-		}
-		log.Warningf("%s: Got signal: %v", name, s)
-		done := make(chan bool)
-		dArgs := append([]string{}, args...)
-		dArgs = append(dArgs, "-alsologtostderr=true", "debug", "--stacks", id)
-		go func(dArgs []string) {
-			cmd := exec.Command(*runscPath, dArgs...)
-			cmd.Stdout = os.Stdout
-			cmd.Stderr = os.Stderr
-			cmd.Run()
-			done <- true
-		}(dArgs)
-
-		timeout := time.After(3 * time.Second)
-		select {
-		case <-timeout:
-			log.Infof("runsc debug --stacks is timeouted")
-		case <-done:
-		}
-
-		log.Warningf("Send SIGTERM to the sandbox process")
-		dArgs = append(args, "debug",
-			fmt.Sprintf("--signal=%d", syscall.SIGTERM),
-			id)
-		cmd := exec.Command(*runscPath, dArgs...)
-		cmd.Stdout = os.Stdout
-		cmd.Stderr = os.Stderr
-		cmd.Run()
-	}()
-
-	err = cmd.Run()
-
-	signal.Stop(sig)
-	close(sig)
-
-	return err
-}
-
-// setupUDSTree updates the spec to expose a UDS tree for gofer socket testing.
-func setupUDSTree(spec *specs.Spec) (cleanup func(), err error) {
-	socketDir, cleanup, err := uds.CreateSocketTree("/tmp")
-	if err != nil {
-		return nil, fmt.Errorf("failed to create socket tree: %v", err)
-	}
-
-	// Standard access to entire tree.
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets",
-		Source:      socketDir,
-		Type:        "bind",
-	})
-
-	// Individial attach points for each socket to test mounts that attach
-	// directly to the sockets.
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/stream/echo",
-		Source:      filepath.Join(socketDir, "stream/echo"),
-		Type:        "bind",
-	})
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/stream/nonlistening",
-		Source:      filepath.Join(socketDir, "stream/nonlistening"),
-		Type:        "bind",
-	})
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/seqpacket/echo",
-		Source:      filepath.Join(socketDir, "seqpacket/echo"),
-		Type:        "bind",
-	})
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/seqpacket/nonlistening",
-		Source:      filepath.Join(socketDir, "seqpacket/nonlistening"),
-		Type:        "bind",
-	})
-	spec.Mounts = append(spec.Mounts, specs.Mount{
-		Destination: "/tmp/sockets-attach/dgram/null",
-		Source:      filepath.Join(socketDir, "dgram/null"),
-		Type:        "bind",
-	})
-
-	spec.Process.Env = append(spec.Process.Env, "TEST_UDS_TREE=/tmp/sockets")
-	spec.Process.Env = append(spec.Process.Env, "TEST_UDS_ATTACH_TREE=/tmp/sockets-attach")
-
-	return cleanup, nil
-}
-
-// runsTestCaseRunsc runs the test case in runsc.
-func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
-	// Run a new container with the test executable and filter for the
-	// given test suite and name.
-	spec := testutil.NewSpecWithArgs(testBin, gtest.FilterTestFlag+"="+tc.FullName())
-
-	// Mark the root as writeable, as some tests attempt to
-	// write to the rootfs, and expect EACCES, not EROFS.
-	spec.Root.Readonly = false
-
-	// Test spec comes with pre-defined mounts that we don't want. Reset it.
-	spec.Mounts = nil
-	if *useTmpfs {
-		// Forces '/tmp' to be mounted as tmpfs, otherwise test that rely on
-		// features only available in gVisor's internal tmpfs may fail.
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Destination: "/tmp",
-			Type:        "tmpfs",
-		})
-	} else {
-		// Use a gofer-backed directory as '/tmp'.
-		//
-		// Tests might be running in parallel, so make sure each has a
-		// unique test temp dir.
-		//
-		// Some tests (e.g., sticky) access this mount from other
-		// users, so make sure it is world-accessible.
-		tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "")
-		if err != nil {
-			t.Fatalf("could not create temp dir: %v", err)
-		}
-		defer os.RemoveAll(tmpDir)
-
-		if err := os.Chmod(tmpDir, 0777); err != nil {
-			t.Fatalf("could not chmod temp dir: %v", err)
-		}
-
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Type:        "bind",
-			Destination: "/tmp",
-			Source:      tmpDir,
-		})
-	}
-
-	// Set environment variables that indicate we are
-	// running in gVisor with the given platform and network.
-	platformVar := "TEST_ON_GVISOR"
-	networkVar := "GVISOR_NETWORK"
-	env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network)
-
-	// Remove env variables that cause the gunit binary to write output
-	// files, since they will stomp on eachother, and on the output files
-	// from this go test.
-	env = filterEnv(env, []string{"GUNIT_OUTPUT", "TEST_PREMATURE_EXIT_FILE", "XML_OUTPUT_FILE"})
-
-	// Remove shard env variables so that the gunit binary does not try to
-	// intepret them.
-	env = filterEnv(env, []string{"TEST_SHARD_INDEX", "TEST_TOTAL_SHARDS", "GTEST_SHARD_INDEX", "GTEST_TOTAL_SHARDS"})
-
-	// Set TEST_TMPDIR to /tmp, as some of the syscall tests require it to
-	// be backed by tmpfs.
-	for i, kv := range env {
-		if strings.HasPrefix(kv, "TEST_TMPDIR=") {
-			env[i] = "TEST_TMPDIR=/tmp"
-			break
-		}
-	}
-
-	spec.Process.Env = env
-
-	if *addUDSTree {
-		cleanup, err := setupUDSTree(spec)
-		if err != nil {
-			t.Fatalf("error creating UDS tree: %v", err)
-		}
-		defer cleanup()
-	}
-
-	if err := runRunsc(tc, spec); err != nil {
-		t.Errorf("test %q failed with error %v, want nil", tc.FullName(), err)
-	}
-}
-
-// filterEnv returns an environment with the blacklisted variables removed.
-func filterEnv(env, blacklist []string) []string {
-	var out []string
-	for _, kv := range env {
-		ok := true
-		for _, k := range blacklist {
-			if strings.HasPrefix(kv, k+"=") {
-				ok = false
-				break
-			}
-		}
-		if ok {
-			out = append(out, kv)
-		}
-	}
-	return out
-}
-
-func fatalf(s string, args ...interface{}) {
-	fmt.Fprintf(os.Stderr, s+"\n", args...)
-	os.Exit(1)
-}
-
-func matchString(a, b string) (bool, error) {
-	return a == b, nil
-}
-
-func main() {
-	flag.Parse()
-	if *testName == "" {
-		fatalf("test-name flag must be provided")
-	}
-
-	log.SetLevel(log.Info)
-	if *debug {
-		log.SetLevel(log.Debug)
-	}
-
-	if *platform != "native" && *runscPath == "" {
-		if err := testutil.ConfigureExePath(); err != nil {
-			panic(err.Error())
-		}
-		*runscPath = specutils.ExePath
-	}
-
-	// Make sure stdout and stderr are opened with O_APPEND, otherwise logs
-	// from outside the sandbox can (and will) stomp on logs from inside
-	// the sandbox.
-	for _, f := range []*os.File{os.Stdout, os.Stderr} {
-		flags, err := unix.FcntlInt(f.Fd(), unix.F_GETFL, 0)
-		if err != nil {
-			fatalf("error getting file flags for %v: %v", f, err)
-		}
-		if flags&unix.O_APPEND == 0 {
-			flags |= unix.O_APPEND
-			if _, err := unix.FcntlInt(f.Fd(), unix.F_SETFL, flags); err != nil {
-				fatalf("error setting file flags for %v: %v", f, err)
-			}
-		}
-	}
-
-	// Get path to test binary.
-	fullTestName := filepath.Join(testDir, *testName)
-	testBin, err := testutil.FindFile(fullTestName)
-	if err != nil {
-		fatalf("FindFile(%q) failed: %v", fullTestName, err)
-	}
-
-	// Get all test cases in each binary.
-	testCases, err := gtest.ParseTestCases(testBin)
-	if err != nil {
-		fatalf("ParseTestCases(%q) failed: %v", testBin, err)
-	}
-
-	// Get subset of tests corresponding to shard.
-	indices, err := testutil.TestIndicesForShard(len(testCases))
-	if err != nil {
-		fatalf("TestsForShard() failed: %v", err)
-	}
-
-	// Run the tests.
-	var tests []testing.InternalTest
-	for _, tci := range indices {
-		// Capture tc.
-		tc := testCases[tci]
-		testName := fmt.Sprintf("%s_%s", tc.Suite, tc.Name)
-		tests = append(tests, testing.InternalTest{
-			Name: testName,
-			F: func(t *testing.T) {
-				if *parallel {
-					t.Parallel()
-				}
-				if *platform == "native" {
-					// Run the test case on host.
-					runTestCaseNative(testBin, tc, t)
-				} else {
-					// Run the test case in runsc.
-					runTestCaseRunsc(testBin, tc, t)
-				}
-			},
-		})
-	}
-
-	testing.Main(matchString, tests, nil, nil)
-}
diff --git a/test/syscalls/syscall_test_runner.sh b/test/syscalls/syscall_test_runner.sh
deleted file mode 100755
index 864bb2de4..000000000
--- a/test/syscalls/syscall_test_runner.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# syscall_test_runner.sh is a simple wrapper around the go syscall test runner.
-# It exists so that we can build the syscall test runner once, and use it for
-# all syscall tests, rather than build it for each test run.
-
-set -euf -x -o pipefail
-
-echo -- "$@"
-
-if [[ -n "${TEST_UNDECLARED_OUTPUTS_DIR}" ]]; then
-  mkdir -p "${TEST_UNDECLARED_OUTPUTS_DIR}"
-  chmod a+rwx "${TEST_UNDECLARED_OUTPUTS_DIR}"
-fi
-
-# Get location of syscall_test_runner binary.
-readonly runner=$(find "${TEST_SRCDIR}" -name syscall_test_runner)
-
-# Pass the arguments of this script directly to the runner.
-exec "${runner}" "$@"
diff --git a/test/util/BUILD b/test/util/BUILD
index 1f22ebe29..8b5a0f25c 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "cc_library", "cc_test", "gtest", "select_system")
+load("//tools:defs.bzl", "cc_library", "cc_test", "gbenchmark", "gtest", "select_system")
 
 package(
     default_visibility = ["//:sandbox"],
@@ -260,6 +260,7 @@ cc_library(
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
         gtest,
+        gbenchmark,
     ],
 )
 
diff --git a/test/util/test_main.cc b/test/util/test_main.cc
index 5c7ee0064..1f389e58f 100644
--- a/test/util/test_main.cc
+++ b/test/util/test_main.cc
@@ -16,5 +16,5 @@
 
 int main(int argc, char** argv) {
   gvisor::testing::TestInit(&argc, &argv);
-  return RUN_ALL_TESTS();
+  return gvisor::testing::RunAllTests();
 }
diff --git a/test/util/test_util.h b/test/util/test_util.h
index 2d22b0eb8..c5cb9d6d6 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -771,6 +771,7 @@ std::string RunfilePath(std::string path);
 #endif
 
 void TestInit(int* argc, char*** argv);
+int RunAllTests(void);
 
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/util/test_util_impl.cc b/test/util/test_util_impl.cc
index ba7c0a85b..7e1ad9e66 100644
--- a/test/util/test_util_impl.cc
+++ b/test/util/test_util_impl.cc
@@ -17,8 +17,12 @@
 #include "gtest/gtest.h"
 #include "absl/flags/flag.h"
 #include "absl/flags/parse.h"
+#include "benchmark/benchmark.h"
 #include "test/util/logging.h"
 
+extern bool FLAGS_benchmark_list_tests;
+extern std::string FLAGS_benchmark_filter;
+
 namespace gvisor {
 namespace testing {
 
@@ -26,6 +30,7 @@ void SetupGvisorDeathTest() {}
 
 void TestInit(int* argc, char*** argv) {
   ::testing::InitGoogleTest(argc, *argv);
+  benchmark::Initialize(argc, *argv);
   ::absl::ParseCommandLine(*argc, *argv);
 
   // Always mask SIGPIPE as it's common and tests aren't expected to handle it.
@@ -34,5 +39,14 @@ void TestInit(int* argc, char*** argv) {
   TEST_CHECK(sigaction(SIGPIPE, &sa, nullptr) == 0);
 }
 
+int RunAllTests() {
+  if (FLAGS_benchmark_list_tests || FLAGS_benchmark_filter != ".") {
+    benchmark::RunSpecifiedBenchmarks();
+    return 0;
+  } else {
+    return RUN_ALL_TESTS();
+  }
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl
index 6798362dc..6f091d759 100644
--- a/tools/bazeldefs/defs.bzl
+++ b/tools/bazeldefs/defs.bzl
@@ -21,6 +21,7 @@ go_image = _go_image
 go_embed_data = _go_embed_data
 go_suffixes = _go_suffixes
 gtest = "@com_google_googletest//:gtest"
+gbenchmark = "@com_google_benchmark//:benchmark"
 loopback = "//tools/bazeldefs:loopback"
 proto_library = native.proto_library
 pkg_deb = _pkg_deb
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 39f035f12..4eece2d83 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -7,7 +7,7 @@ change for Google-internal and bazel-compatible rules.
 
 load("//tools/go_stateify:defs.bzl", "go_stateify")
 load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps")
-load("//tools/bazeldefs:defs.bzl", "go_suffixes", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
+load("//tools/bazeldefs:defs.bzl", "go_suffixes", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system")
 
 # Delegate directly.
 cc_binary = _cc_binary
@@ -21,6 +21,7 @@ go_image = _go_image
 go_test = _go_test
 go_tool_library = _go_tool_library
 gtest = _gtest
+gbenchmark = _gbenchmark
 pkg_deb = _pkg_deb
 pkg_tar = _pkg_tar
 py_library = _py_library
-- 
cgit v1.2.3


From 4a73bae269ae9f52a962ae3b08a17ccaacf7ba80 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Thu, 20 Feb 2020 15:19:40 -0800
Subject: Initial network namespace support.

TCP/IP will work with netstack networking. hostinet doesn't work, and sockets
will have the same behavior as they do now.

Until userspace is able to create devices, the default loopback device can be
used for testing.

/proc/net and /sys/net will still be connected to the root network stack; this
is the same as the current behavior.

Issue #1833

PiperOrigin-RevId: 296309389
---
 pkg/sentry/fs/proc/net.go                |   5 +-
 pkg/sentry/fs/proc/sys_net.go            |   4 +-
 pkg/sentry/fsimpl/proc/tasks_net.go      |   5 +-
 pkg/sentry/fsimpl/proc/tasks_sys.go      |   4 +-
 pkg/sentry/fsimpl/testutil/kernel.go     |   1 +
 pkg/sentry/inet/BUILD                    |   1 +
 pkg/sentry/inet/namespace.go             |  99 +++++++++++++++++++++++++
 pkg/sentry/kernel/kernel.go              |  26 ++++---
 pkg/sentry/kernel/task.go                |   9 +--
 pkg/sentry/kernel/task_clone.go          |  16 ++--
 pkg/sentry/kernel/task_net.go            |  19 +++--
 pkg/sentry/kernel/task_start.go          |   8 +-
 pkg/tcpip/time_unsafe.go                 |   2 +
 runsc/boot/BUILD                         |   2 +-
 runsc/boot/controller.go                 |  11 +--
 runsc/boot/loader.go                     | 121 +++++++++++++++++++++----------
 runsc/boot/network.go                    |  27 +++++++
 runsc/boot/pprof.go                      |  18 -----
 runsc/boot/pprof/BUILD                   |  11 +++
 runsc/boot/pprof/pprof.go                |  20 +++++
 runsc/sandbox/network.go                 |  25 +------
 test/syscalls/BUILD                      |   2 +
 test/syscalls/linux/BUILD                |  17 +++++
 test/syscalls/linux/network_namespace.cc | 121 +++++++++++++++++++++++++++++++
 24 files changed, 451 insertions(+), 123 deletions(-)
 create mode 100644 pkg/sentry/inet/namespace.go
 delete mode 100644 runsc/boot/pprof.go
 create mode 100644 runsc/boot/pprof/BUILD
 create mode 100644 runsc/boot/pprof/pprof.go
 create mode 100644 test/syscalls/linux/network_namespace.cc

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 6f2775344..95d5817ff 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -43,7 +43,10 @@ import (
 // newNet creates a new proc net entry.
 func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode {
 	var contents map[string]*fs.Inode
-	if s := p.k.NetworkStack(); s != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process. We should make this per-process,
+	// a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net.
+	if s := p.k.RootNetworkNamespace().Stack(); s != nil {
 		contents = map[string]*fs.Inode{
 			"dev":  seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc),
 			"snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc),
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index 0772d4ae4..d4c4b533d 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -357,7 +357,9 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine
 
 func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 	var contents map[string]*fs.Inode
-	if s := p.k.NetworkStack(); s != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process.
+	if s := p.k.RootNetworkNamespace().Stack(); s != nil {
 		contents = map[string]*fs.Inode{
 			"ipv4": p.newSysNetIPv4Dir(ctx, msrc, s),
 			"core": p.newSysNetCore(ctx, msrc, s),
diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go
index 608fec017..d4e1812d8 100644
--- a/pkg/sentry/fsimpl/proc/tasks_net.go
+++ b/pkg/sentry/fsimpl/proc/tasks_net.go
@@ -39,7 +39,10 @@ import (
 
 func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
 	var contents map[string]*kernfs.Dentry
-	if stack := k.NetworkStack(); stack != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process. We should make this per-process,
+	// a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net.
+	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
 		const (
 			arp       = "IP address       HW type     Flags       HW address            Mask     Device\n"
 			netlink   = "sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n"
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index c7ce74883..3d5dc463c 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -50,7 +50,9 @@ func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k
 func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
 	var contents map[string]*kernfs.Dentry
 
-	if stack := k.NetworkStack(); stack != nil {
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process.
+	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
 		contents = map[string]*kernfs.Dentry{
 			"ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{
 				"tcp_sack": newDentry(root, inoGen.NextIno(), 0644, &tcpSackData{stack: stack}),
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index d0be32e72..488478e29 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -128,6 +128,7 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns
 		ThreadGroup:             tc,
 		TaskContext:             &kernel.TaskContext{Name: name},
 		Credentials:             auth.CredentialsFromContext(ctx),
+		NetworkNamespace:        k.RootNetworkNamespace(),
 		AllowedCPUMask:          sched.NewFullCPUSet(k.ApplicationCores()),
 		UTSNamespace:            kernel.UTSNamespaceFromContext(ctx),
 		IPCNamespace:            kernel.IPCNamespaceFromContext(ctx),
diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD
index 334432abf..07bf39fed 100644
--- a/pkg/sentry/inet/BUILD
+++ b/pkg/sentry/inet/BUILD
@@ -10,6 +10,7 @@ go_library(
     srcs = [
         "context.go",
         "inet.go",
+        "namespace.go",
         "test_stack.go",
     ],
     deps = [
diff --git a/pkg/sentry/inet/namespace.go b/pkg/sentry/inet/namespace.go
new file mode 100644
index 000000000..c16667e7f
--- /dev/null
+++ b/pkg/sentry/inet/namespace.go
@@ -0,0 +1,99 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package inet
+
+// Namespace represents a network namespace. See network_namespaces(7).
+//
+// +stateify savable
+type Namespace struct {
+	// stack is the network stack implementation of this network namespace.
+	stack Stack `state:"nosave"`
+
+	// creator allows the kernel to create a new network stack for each new
+	// network namespace. If nil, non-root network namespaces get no stack.
+	creator NetworkStackCreator
+
+	// isRoot indicates whether this is the root network namespace.
+	isRoot bool
+}
+
+// NewRootNamespace creates the root network namespace, with creator
+// allowing new network namespaces to be created. If creator is nil, no
+// networking will function if the network is namespaced.
+func NewRootNamespace(stack Stack, creator NetworkStackCreator) *Namespace {
+	return &Namespace{
+		stack:   stack,
+		creator: creator,
+		isRoot:  true,
+	}
+}
+
+// NewNamespace creates a new network namespace from the root.
+func NewNamespace(root *Namespace) *Namespace {
+	n := &Namespace{
+		creator: root.creator,
+	}
+	n.init()
+	return n
+}
+
+// Stack returns the network stack of n. Stack may return nil if no network
+// stack is configured.
+func (n *Namespace) Stack() Stack {
+	return n.stack
+}
+
+// IsRoot returns whether n is the root network namespace.
+func (n *Namespace) IsRoot() bool {
+	return n.isRoot
+}
+
+// RestoreRootStack restores the root network namespace with stack. This should
+// only be called when restoring kernel.
+func (n *Namespace) RestoreRootStack(stack Stack) {
+	if !n.isRoot {
+		panic("RestoreRootStack can only be called on root network namespace")
+	}
+	if n.stack != nil {
+		panic("RestoreRootStack called after a stack has already been set")
+	}
+	n.stack = stack
+}
+
+func (n *Namespace) init() {
+	// Root network namespace will have stack assigned later.
+	if n.isRoot {
+		return
+	}
+	if n.creator != nil {
+		var err error
+		n.stack, err = n.creator.CreateStack()
+		if err != nil {
+			panic(err)
+		}
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (n *Namespace) afterLoad() {
+	n.init()
+}
+
+// NetworkStackCreator allows new instances of a network stack to be created. It
+// is used by the kernel to create new network namespaces when requested.
+type NetworkStackCreator interface {
+	// CreateStack creates a new network stack for a network namespace.
+	CreateStack() (Stack, error)
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 7da0368f1..c62fd6eb1 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -111,7 +111,7 @@ type Kernel struct {
 	timekeeper                  *Timekeeper
 	tasks                       *TaskSet
 	rootUserNamespace           *auth.UserNamespace
-	networkStack                inet.Stack `state:"nosave"`
+	rootNetworkNamespace        *inet.Namespace
 	applicationCores            uint
 	useHostCores                bool
 	extraAuxv                   []arch.AuxEntry
@@ -260,8 +260,9 @@ type InitKernelArgs struct {
 	// RootUserNamespace is the root user namespace.
 	RootUserNamespace *auth.UserNamespace
 
-	// NetworkStack is the TCP/IP network stack. NetworkStack may be nil.
-	NetworkStack inet.Stack
+	// RootNetworkNamespace is the root network namespace. If nil, no networking
+	// will be available.
+	RootNetworkNamespace *inet.Namespace
 
 	// ApplicationCores is the number of logical CPUs visible to sandboxed
 	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
@@ -320,7 +321,10 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	k.rootUTSNamespace = args.RootUTSNamespace
 	k.rootIPCNamespace = args.RootIPCNamespace
 	k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
-	k.networkStack = args.NetworkStack
+	k.rootNetworkNamespace = args.RootNetworkNamespace
+	if k.rootNetworkNamespace == nil {
+		k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil)
+	}
 	k.applicationCores = args.ApplicationCores
 	if args.UseHostCores {
 		k.useHostCores = true
@@ -543,8 +547,6 @@ func (ts *TaskSet) unregisterEpollWaiters() {
 func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
 	loadStart := time.Now()
 
-	k.networkStack = net
-
 	initAppCores := k.applicationCores
 
 	// Load the pre-saved CPUID FeatureSet.
@@ -575,6 +577,10 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
 	log.Infof("Kernel load stats: %s", &stats)
 	log.Infof("Kernel load took [%s].", time.Since(kernelStart))
 
+	// rootNetworkNamespace should be populated after loading the state file.
+	// Restore the root network stack.
+	k.rootNetworkNamespace.RestoreRootStack(net)
+
 	// Load the memory file's state.
 	memoryStart := time.Now()
 	if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil {
@@ -905,6 +911,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		FSContext:               fsContext,
 		FDTable:                 args.FDTable,
 		Credentials:             args.Credentials,
+		NetworkNamespace:        k.RootNetworkNamespace(),
 		AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
 		UTSNamespace:            args.UTSNamespace,
 		IPCNamespace:            args.IPCNamespace,
@@ -1255,10 +1262,9 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
 	return k.rootAbstractSocketNamespace
 }
 
-// NetworkStack returns the network stack. NetworkStack may return nil if no
-// network stack is available.
-func (k *Kernel) NetworkStack() inet.Stack {
-	return k.networkStack
+// RootNetworkNamespace returns the root network namespace, always non-nil.
+func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
+	return k.rootNetworkNamespace
 }
 
 // GlobalInit returns the thread group with ID 1 in the root PID namespace, or
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index a3443ff21..e37e23231 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -486,13 +486,10 @@ type Task struct {
 	numaPolicy   int32
 	numaNodeMask uint64
 
-	// If netns is true, the task is in a non-root network namespace. Network
-	// namespaces aren't currently implemented in full; being in a network
-	// namespace simply prevents the task from observing any network devices
-	// (including loopback) or using abstract socket addresses (see unix(7)).
+	// netns is the task's network namespace. netns is never nil.
 	//
-	// netns is protected by mu. netns is owned by the task goroutine.
-	netns bool
+	// netns is protected by mu.
+	netns *inet.Namespace
 
 	// If rseqPreempted is true, before the next call to p.Switch(),
 	// interrupt rseq critical regions as defined by rseqAddr and
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index ba74b4c1c..78866f280 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bpf"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -54,8 +55,7 @@ type SharingOptions struct {
 	NewUserNamespace bool
 
 	// If NewNetworkNamespace is true, the task should have an independent
-	// network namespace. (Note that network namespaces are not really
-	// implemented; see comment on Task.netns for details.)
+	// network namespace.
 	NewNetworkNamespace bool
 
 	// If NewFiles is true, the task should use an independent file descriptor
@@ -199,6 +199,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		ipcns = NewIPCNamespace(userns)
 	}
 
+	netns := t.NetworkNamespace()
+	if opts.NewNetworkNamespace {
+		netns = inet.NewNamespace(netns)
+	}
+
 	// TODO(b/63601033): Implement CLONE_NEWNS.
 	mntnsVFS2 := t.mountNamespaceVFS2
 	if mntnsVFS2 != nil {
@@ -268,7 +273,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		FDTable:                 fdTable,
 		Credentials:             creds,
 		Niceness:                t.Niceness(),
-		NetworkNamespaced:       t.netns,
+		NetworkNamespace:        netns,
 		AllowedCPUMask:          t.CPUMask(),
 		UTSNamespace:            utsns,
 		IPCNamespace:            ipcns,
@@ -283,9 +288,6 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	} else {
 		cfg.InheritParent = t
 	}
-	if opts.NewNetworkNamespace {
-		cfg.NetworkNamespaced = true
-	}
 	nt, err := t.tg.pidns.owner.NewTask(cfg)
 	if err != nil {
 		if opts.NewThreadGroup {
@@ -482,7 +484,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 			t.mu.Unlock()
 			return syserror.EPERM
 		}
-		t.netns = true
+		t.netns = inet.NewNamespace(t.netns)
 	}
 	if opts.NewUTSNamespace {
 		if !haveCapSysAdmin {
diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go
index 172a31e1d..f7711232c 100644
--- a/pkg/sentry/kernel/task_net.go
+++ b/pkg/sentry/kernel/task_net.go
@@ -22,14 +22,23 @@ import (
 func (t *Task) IsNetworkNamespaced() bool {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	return t.netns
+	return !t.netns.IsRoot()
 }
 
 // NetworkContext returns the network stack used by the task. NetworkContext
 // may return nil if no network stack is available.
+//
+// TODO(gvisor.dev/issue/1833): Migrate callers of this method to
+// NetworkNamespace().
 func (t *Task) NetworkContext() inet.Stack {
-	if t.IsNetworkNamespaced() {
-		return nil
-	}
-	return t.k.networkStack
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.netns.Stack()
+}
+
+// NetworkNamespace returns the network namespace observed by the task.
+func (t *Task) NetworkNamespace() *inet.Namespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.netns
 }
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index f9236a842..a5035bb7f 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
@@ -65,9 +66,8 @@ type TaskConfig struct {
 	// Niceness is the niceness of the new task.
 	Niceness int
 
-	// If NetworkNamespaced is true, the new task should observe a non-root
-	// network namespace.
-	NetworkNamespaced bool
+	// NetworkNamespace is the network namespace to be used for the new task.
+	NetworkNamespace *inet.Namespace
 
 	// AllowedCPUMask contains the cpus that this task can run on.
 	AllowedCPUMask sched.CPUSet
@@ -133,7 +133,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		allowedCPUMask:     cfg.AllowedCPUMask.Copy(),
 		ioUsage:            &usage.IO{},
 		niceness:           cfg.Niceness,
-		netns:              cfg.NetworkNamespaced,
+		netns:              cfg.NetworkNamespace,
 		utsns:              cfg.UTSNamespace,
 		ipcns:              cfg.IPCNamespace,
 		abstractSockets:    cfg.AbstractSocketNamespace,
diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go
index 48764b978..2f98a996f 100644
--- a/pkg/tcpip/time_unsafe.go
+++ b/pkg/tcpip/time_unsafe.go
@@ -25,6 +25,8 @@ import (
 )
 
 // StdClock implements Clock with the time package.
+//
+// +stateify savable
 type StdClock struct{}
 
 var _ Clock = (*StdClock)(nil)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index ae4dd102a..26f68fe3d 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -19,7 +19,6 @@ go_library(
         "loader_amd64.go",
         "loader_arm64.go",
         "network.go",
-        "pprof.go",
         "strace.go",
         "user.go",
     ],
@@ -91,6 +90,7 @@ go_library(
         "//pkg/usermem",
         "//runsc/boot/filter",
         "//runsc/boot/platforms",
+        "//runsc/boot/pprof",
         "//runsc/specutils",
         "@com_github_golang_protobuf//proto:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 9c9e94864..17e774e0c 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -32,6 +32,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/boot/pprof"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -142,7 +143,7 @@ func newController(fd int, l *Loader) (*controller, error) {
 	}
 	srv.Register(manager)
 
-	if eps, ok := l.k.NetworkStack().(*netstack.Stack); ok {
+	if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
 		net := &Network{
 			Stack: eps.Stack,
 		}
@@ -341,7 +342,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 		return fmt.Errorf("creating memory file: %v", err)
 	}
 	k.SetMemoryFile(mf)
-	networkStack := cm.l.k.NetworkStack()
+	networkStack := cm.l.k.RootNetworkNamespace().Stack()
 	cm.l.k = k
 
 	// Set up the restore environment.
@@ -365,9 +366,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	}
 
 	if cm.l.conf.ProfileEnable {
-		// initializePProf opens /proc/self/maps, so has to be
-		// called before installing seccomp filters.
-		initializePProf()
+		// pprof.Initialize opens /proc/self/maps, so has to be called before
+		// installing seccomp filters.
+		pprof.Initialize()
 	}
 
 	// Seccomp filters have to be applied before parsing the state file.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index eef43b9df..e7ca98134 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -49,6 +49,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -60,6 +61,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/runsc/boot/filter"
 	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
+	"gvisor.dev/gvisor/runsc/boot/pprof"
 	"gvisor.dev/gvisor/runsc/specutils"
 
 	// Include supported socket providers.
@@ -230,11 +232,8 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("enabling strace: %v", err)
 	}
 
-	// Create an empty network stack because the network namespace may be empty at
-	// this point. Netns is configured before Run() is called. Netstack is
-	// configured using a control uRPC message. Host network is configured inside
-	// Run().
-	networkStack, err := newEmptyNetworkStack(args.Conf, k, k)
+	// Create root network namespace/stack.
+	netns, err := newRootNetworkNamespace(args.Conf, k, k)
 	if err != nil {
 		return nil, fmt.Errorf("creating network: %v", err)
 	}
@@ -277,7 +276,7 @@ func New(args Args) (*Loader, error) {
 		FeatureSet:                  cpuid.HostFeatureSet(),
 		Timekeeper:                  tk,
 		RootUserNamespace:           creds.UserNamespace,
-		NetworkStack:                networkStack,
+		RootNetworkNamespace:        netns,
 		ApplicationCores:            uint(args.NumCPU),
 		Vdso:                        vdso,
 		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
@@ -466,7 +465,7 @@ func (l *Loader) run() error {
 		// Delay host network configuration to this point because network namespace
 		// is configured after the loader is created and before Run() is called.
 		log.Debugf("Configuring host network")
-		stack := l.k.NetworkStack().(*hostinet.Stack)
+		stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
 		if err := stack.Configure(); err != nil {
 			return err
 		}
@@ -485,7 +484,7 @@ func (l *Loader) run() error {
 	// l.restore is set by the container manager when a restore call is made.
 	if !l.restore {
 		if l.conf.ProfileEnable {
-			initializePProf()
+			pprof.Initialize()
 		}
 
 		// Finally done with all configuration. Setup filters before user code
@@ -908,48 +907,92 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newEmptyNetworkStack(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+	// Create an empty network stack because the network namespace may be empty at
+	// this point. Netns is configured before Run() is called. Netstack is
+	// configured using a control uRPC message. Host network is configured inside
+	// Run().
 	switch conf.Network {
 	case NetworkHost:
-		return hostinet.NewStack(), nil
+		// No network namespacing support for hostinet yet, hence creator is nil.
+		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
 
 	case NetworkNone, NetworkSandbox:
-		// NetworkNone sets up loopback using netstack.
-		netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
-		transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
-		s := netstack.Stack{stack.New(stack.Options{
-			NetworkProtocols:   netProtos,
-			TransportProtocols: transProtos,
-			Clock:              clock,
-			Stats:              netstack.Metrics,
-			HandleLocal:        true,
-			// Enable raw sockets for users with sufficient
-			// privileges.
-			RawFactory: raw.EndpointFactory{},
-			UniqueID:   uniqueID,
-		})}
-
-		// Enable SACK Recovery.
-		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
-			return nil, fmt.Errorf("failed to enable SACK: %v", err)
+		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
+		if err != nil {
+			return nil, err
 		}
+		creator := &sandboxNetstackCreator{
+			clock:    clock,
+			uniqueID: uniqueID,
+		}
+		return inet.NewRootNamespace(s, creator), nil
 
-		// Set default TTLs as required by socket/netstack.
-		s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
-		s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	default:
+		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	}
 
-		// Enable Receive Buffer Auto-Tuning.
-		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-			return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
-		}
+}
 
-		s.FillDefaultIPTables()
+func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
+	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+	s := netstack.Stack{stack.New(stack.Options{
+		NetworkProtocols:   netProtos,
+		TransportProtocols: transProtos,
+		Clock:              clock,
+		Stats:              netstack.Metrics,
+		HandleLocal:        true,
+		// Enable raw sockets for users with sufficient
+		// privileges.
+		RawFactory: raw.EndpointFactory{},
+		UniqueID:   uniqueID,
+	})}
 
-		return &s, nil
+	// Enable SACK Recovery.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
+		return nil, fmt.Errorf("failed to enable SACK: %v", err)
+	}
 
-	default:
-		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	// Set default TTLs as required by socket/netstack.
+	s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+
+	// Enable Receive Buffer Auto-Tuning.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
+		return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
+	}
+
+	s.FillDefaultIPTables()
+
+	return &s, nil
+}
+
+// sandboxNetstackCreator implements inet.NetworkStackCreator.
+//
+// +stateify savable
+type sandboxNetstackCreator struct {
+	clock    tcpip.Clock
+	uniqueID stack.UniqueID
+}
+
+// CreateStack implements inet.NetworkStackCreator.CreateStack.
+func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
+	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
+	if err != nil {
+		return nil, err
 	}
+
+	// Setup loopback.
+	n := &Network{Stack: s.(*netstack.Stack).Stack}
+	nicID := tcpip.NICID(f.uniqueID.UniqueID())
+	link := DefaultLoopbackLink
+	linkEP := loopback.New()
+	if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+		return nil, err
+	}
+
+	return s, nil
 }
 
 // signal sends a signal to one or more processes in a container. If PID is 0,
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 6a8765ec8..bee6ee336 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -17,6 +17,7 @@ package boot
 import (
 	"fmt"
 	"net"
+	"strings"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/log"
@@ -31,6 +32,32 @@ import (
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
+var (
+	// DefaultLoopbackLink contains the addresses "127.0.0.1" and "::1" and the
+	// routes "127.0.0.0/8" and "::1/128" for the "lo" interface.
+	DefaultLoopbackLink = LoopbackLink{
+		Name: "lo",
+		Addresses: []net.IP{
+			net.IP("\x7f\x00\x00\x01"),
+			net.IPv6loopback,
+		},
+		Routes: []Route{
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv4(0x7f, 0, 0, 0),
+					Mask: net.IPv4Mask(0xff, 0, 0, 0),
+				},
+			},
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv6loopback,
+					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
+				},
+			},
+		},
+	}
+)
+
 // Network exposes methods that can be used to configure a network stack.
 type Network struct {
 	Stack *stack.Stack
diff --git a/runsc/boot/pprof.go b/runsc/boot/pprof.go
deleted file mode 100644
index 463362f02..000000000
--- a/runsc/boot/pprof.go
+++ /dev/null
@@ -1,18 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-func initializePProf() {
-}
diff --git a/runsc/boot/pprof/BUILD b/runsc/boot/pprof/BUILD
new file mode 100644
index 000000000..29cb42b2f
--- /dev/null
+++ b/runsc/boot/pprof/BUILD
@@ -0,0 +1,11 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "pprof",
+    srcs = ["pprof.go"],
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+)
diff --git a/runsc/boot/pprof/pprof.go b/runsc/boot/pprof/pprof.go
new file mode 100644
index 000000000..1ded20dee
--- /dev/null
+++ b/runsc/boot/pprof/pprof.go
@@ -0,0 +1,20 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pprof provides a stub to initialize custom profilers.
+package pprof
+
+// Initialize will be called at boot for initializing custom profilers.
+func Initialize() {
+}
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 99e143696..bc093fba5 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -21,7 +21,6 @@ import (
 	"path/filepath"
 	"runtime"
 	"strconv"
-	"strings"
 	"syscall"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
@@ -75,30 +74,8 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
 }
 
 func createDefaultLoopbackInterface(conn *urpc.Client) error {
-	link := boot.LoopbackLink{
-		Name: "lo",
-		Addresses: []net.IP{
-			net.IP("\x7f\x00\x00\x01"),
-			net.IPv6loopback,
-		},
-		Routes: []boot.Route{
-			{
-				Destination: net.IPNet{
-
-					IP:   net.IPv4(0x7f, 0, 0, 0),
-					Mask: net.IPv4Mask(0xff, 0, 0, 0),
-				},
-			},
-			{
-				Destination: net.IPNet{
-					IP:   net.IPv6loopback,
-					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
-				},
-			},
-		},
-	}
 	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{
-		LoopbackLinks: []boot.LoopbackLink{link},
+		LoopbackLinks: []boot.LoopbackLink{boot.DefaultLoopbackLink},
 	}, nil); err != nil {
 		return fmt.Errorf("creating loopback link and routes: %v", err)
 	}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index d69ac8356..d1977d4de 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -258,6 +258,8 @@ syscall_test(
 
 syscall_test(test = "//test/syscalls/linux:munmap_test")
 
+syscall_test(test = "//test/syscalls/linux:network_namespace_test")
+
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:open_create_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 05a818795..aa303af84 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3639,6 +3639,23 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "network_namespace_test",
+    testonly = 1,
+    srcs = ["network_namespace.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        gtest,
+        "//test/util:capability_util",
+        "//test/util:memory_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
 cc_binary(
     name = "semaphore_test",
     testonly = 1,
diff --git a/test/syscalls/linux/network_namespace.cc b/test/syscalls/linux/network_namespace.cc
new file mode 100644
index 000000000..6ea48c263
--- /dev/null
+++ b/test/syscalls/linux/network_namespace.cc
@@ -0,0 +1,121 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <net/if.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/synchronization/notification.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/memory_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using TestFunc = std::function<PosixError()>;
+using RunFunc = std::function<PosixError(TestFunc)>;
+
+struct NamespaceStrategy {
+  RunFunc run;
+
+  static NamespaceStrategy Of(RunFunc run) {
+    NamespaceStrategy s;
+    s.run = run;
+    return s;
+  }
+};
+
+PosixError RunWithUnshare(TestFunc fn) {
+  PosixError err = PosixError(-1, "function did not return a value");
+  ScopedThread t([&] {
+    if (unshare(CLONE_NEWNET) != 0) {
+      err = PosixError(errno);
+      return;
+    }
+    err = fn();
+  });
+  t.Join();
+  return err;
+}
+
+PosixError RunWithClone(TestFunc fn) {
+  struct Args {
+    absl::Notification n;
+    TestFunc fn;
+    PosixError err;
+  };
+  Args args;
+  args.fn = fn;
+  args.err = PosixError(-1, "function did not return a value");
+
+  ASSIGN_OR_RETURN_ERRNO(
+      Mapping child_stack,
+      MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+  pid_t child = clone(
+      +[](void *arg) {
+        Args *args = reinterpret_cast<Args *>(arg);
+        args->err = args->fn();
+        args->n.Notify();
+        syscall(SYS_exit, 0);  // Exit manually. No return address on stack.
+        return 0;
+      },
+      reinterpret_cast<void *>(child_stack.addr() + kPageSize),
+      CLONE_NEWNET | CLONE_THREAD | CLONE_SIGHAND | CLONE_VM, &args);
+  if (child < 0) {
+    return PosixError(errno, "clone() failed");
+  }
+  args.n.WaitForNotification();
+  return args.err;
+}
+
+class NetworkNamespaceTest
+    : public ::testing::TestWithParam<NamespaceStrategy> {};
+
+TEST_P(NetworkNamespaceTest, LoopbackExists) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  EXPECT_NO_ERRNO(GetParam().run([]() {
+    // TODO(gvisor.dev/issue/1833): Update this to test that only "lo" exists.
+    // Check loopback device exists.
+    int sock = socket(AF_INET, SOCK_DGRAM, 0);
+    if (sock < 0) {
+      return PosixError(errno, "socket() failed");
+    }
+    struct ifreq ifr;
+    snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
+    if (ioctl(sock, SIOCGIFINDEX, &ifr) < 0) {
+      return PosixError(errno, "ioctl() failed, lo cannot be found");
+    }
+    return NoError();
+  }));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    AllNetworkNamespaceTest, NetworkNamespaceTest,
+    ::testing::Values(NamespaceStrategy::Of(RunWithUnshare),
+                      NamespaceStrategy::Of(RunWithClone)));
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From b8f56c79be40d9c75f4e2f279c9d821d1c1c3569 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 21 Feb 2020 15:41:56 -0800
Subject: Implement tap/tun device in vfs.

PiperOrigin-RevId: 296526279
---
 pkg/abi/linux/BUILD                              |   1 +
 pkg/abi/linux/ioctl.go                           |  26 ++
 pkg/abi/linux/ioctl_tun.go                       |  29 ++
 pkg/sentry/fs/dev/BUILD                          |   5 +
 pkg/sentry/fs/dev/dev.go                         |  10 +-
 pkg/sentry/fs/dev/net_tun.go                     | 170 +++++++++++
 pkg/syserror/syserror.go                         |   1 +
 pkg/tcpip/buffer/view.go                         |   6 +
 pkg/tcpip/link/channel/BUILD                     |   1 +
 pkg/tcpip/link/channel/channel.go                | 180 +++++++++---
 pkg/tcpip/link/tun/BUILD                         |  18 +-
 pkg/tcpip/link/tun/device.go                     | 352 +++++++++++++++++++++++
 pkg/tcpip/link/tun/protocol.go                   |  56 ++++
 pkg/tcpip/stack/nic.go                           |  32 +++
 pkg/tcpip/stack/stack.go                         |  39 +++
 test/syscalls/BUILD                              |   2 +
 test/syscalls/linux/BUILD                        |  30 ++
 test/syscalls/linux/dev.cc                       |   7 +
 test/syscalls/linux/socket_netlink_route_util.cc | 163 +++++++++++
 test/syscalls/linux/socket_netlink_route_util.h  |  55 ++++
 test/syscalls/linux/tuntap.cc                    | 346 ++++++++++++++++++++++
 21 files changed, 1490 insertions(+), 39 deletions(-)
 create mode 100644 pkg/abi/linux/ioctl_tun.go
 create mode 100644 pkg/sentry/fs/dev/net_tun.go
 create mode 100644 pkg/tcpip/link/tun/device.go
 create mode 100644 pkg/tcpip/link/tun/protocol.go
 create mode 100644 test/syscalls/linux/socket_netlink_route_util.cc
 create mode 100644 test/syscalls/linux/socket_netlink_route_util.h
 create mode 100644 test/syscalls/linux/tuntap.cc

(limited to 'test/syscalls/linux')

diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index a89f34d4b..322d1ccc4 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -30,6 +30,7 @@ go_library(
         "futex.go",
         "inotify.go",
         "ioctl.go",
+        "ioctl_tun.go",
         "ip.go",
         "ipc.go",
         "limits.go",
diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go
index 0e18db9ef..2062e6a4b 100644
--- a/pkg/abi/linux/ioctl.go
+++ b/pkg/abi/linux/ioctl.go
@@ -72,3 +72,29 @@ const (
 	SIOCGMIIPHY   = 0x8947
 	SIOCGMIIREG   = 0x8948
 )
+
+// ioctl(2) direction values, used as the dir argument of IOC when computing
+// request numbers. Constants from asm-generic/ioctl.h.
+const (
+	_IOC_NONE  = 0
+	_IOC_WRITE = 1
+	_IOC_READ  = 2
+)
+
+// Bit widths and shifts of the fields of an ioctl request number. Constants from asm-generic/ioctl.h.
+const (
+	_IOC_NRBITS   = 8
+	_IOC_TYPEBITS = 8
+	_IOC_SIZEBITS = 14
+	_IOC_DIRBITS  = 2
+
+	_IOC_NRSHIFT   = 0
+	_IOC_TYPESHIFT = _IOC_NRSHIFT + _IOC_NRBITS     // 8
+	_IOC_SIZESHIFT = _IOC_TYPESHIFT + _IOC_TYPEBITS // 16
+	_IOC_DIRSHIFT  = _IOC_SIZESHIFT + _IOC_SIZEBITS // 30
+)
+
+// IOC computes an ioctl request number the way the _IOC macro in asm-generic/ioctl.h does.
+func IOC(dir, typ, nr, size uint32) uint32 {
+	return nr<<_IOC_NRSHIFT | typ<<_IOC_TYPESHIFT | size<<_IOC_SIZESHIFT | dir<<_IOC_DIRSHIFT
+}
diff --git a/pkg/abi/linux/ioctl_tun.go b/pkg/abi/linux/ioctl_tun.go
new file mode 100644
index 000000000..c59c9c136
--- /dev/null
+++ b/pkg/abi/linux/ioctl_tun.go
@@ -0,0 +1,29 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// ioctl(2) request numbers from linux/if_tun.h. They are vars, not consts, because IOC is a function call.
+var (
+	TUNSETIFF = IOC(_IOC_WRITE, 'T', 202, 4)
+	TUNGETIFF = IOC(_IOC_READ, 'T', 210, 4)
+)
+
+// Flags from linux/if_tun.h, carried in the flags of TUNSETIFF/TUNGETIFF requests.
+const (
+	IFF_TUN      = 0x0001
+	IFF_TAP      = 0x0002
+	IFF_NO_PI    = 0x1000
+	IFF_NOFILTER = 0x1000 // same value as IFF_NO_PI
+)
diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 4c4b7d5cc..9b6bb26d0 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -9,6 +9,7 @@ go_library(
         "device.go",
         "fs.go",
         "full.go",
+        "net_tun.go",
         "null.go",
         "random.go",
         "tty.go",
@@ -19,15 +20,19 @@ go_library(
         "//pkg/context",
         "//pkg/rand",
         "//pkg/safemem",
+        "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/ramfs",
         "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
         "//pkg/sentry/pgalloc",
+        "//pkg/sentry/socket/netstack",
         "//pkg/syserror",
+        "//pkg/tcpip/link/tun",
         "//pkg/usermem",
         "//pkg/waiter",
     ],
diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go
index 35bd23991..7e66c29b0 100644
--- a/pkg/sentry/fs/dev/dev.go
+++ b/pkg/sentry/fs/dev/dev.go
@@ -66,8 +66,8 @@ func newMemDevice(ctx context.Context, iops fs.InodeOperations, msrc *fs.MountSo
 	})
 }
 
-func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
-	iops := ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555))
+func newDirectory(ctx context.Context, contents map[string]*fs.Inode, msrc *fs.MountSource) *fs.Inode {
+	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
 	return fs.NewInode(ctx, iops, msrc, fs.StableAttr{
 		DeviceID:  devDevice.DeviceID(),
 		InodeID:   devDevice.NextIno(),
@@ -111,7 +111,7 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		// A devpts is typically mounted at /dev/pts to provide
 		// pseudoterminal support. Place an empty directory there for
 		// the devpts to be mounted over.
-		"pts": newDirectory(ctx, msrc),
+		"pts": newDirectory(ctx, nil, msrc),
 		// Similarly, applications expect a ptmx device at /dev/ptmx
 		// connected to the terminals provided by /dev/pts/. Rather
 		// than creating a device directly (which requires a hairy
@@ -124,6 +124,10 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		"ptmx": newSymlink(ctx, "pts/ptmx", msrc),
 
 		"tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor),
+
+		"net": newDirectory(ctx, map[string]*fs.Inode{
+			"tun": newCharacterDevice(ctx, newNetTunDevice(ctx, fs.RootOwner, 0666), msrc, netTunDevMajor, netTunDevMinor),
+		}, msrc),
 	}
 
 	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
new file mode 100644
index 000000000..755644488
--- /dev/null
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -0,0 +1,170 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package dev
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	netTunDevMajor = 10  // major number given to newCharacterDevice for the net/tun node
+	netTunDevMinor = 200 // minor number given to newCharacterDevice for the net/tun node
+)
+
+// +stateify savable
+type netTunInodeOperations struct { // inode operations of the "tun" node placed under "net" by New
+	fsutil.InodeGenericChecker       `state:"nosave"`
+	fsutil.InodeNoExtendedAttributes `state:"nosave"`
+	fsutil.InodeNoopAllocate         `state:"nosave"`
+	fsutil.InodeNoopRelease          `state:"nosave"`
+	fsutil.InodeNoopTruncate         `state:"nosave"`
+	fsutil.InodeNoopWriteOut         `state:"nosave"`
+	fsutil.InodeNotDirectory         `state:"nosave"`
+	fsutil.InodeNotMappable          `state:"nosave"`
+	fsutil.InodeNotSocket            `state:"nosave"`
+	fsutil.InodeNotSymlink           `state:"nosave"`
+	fsutil.InodeVirtual              `state:"nosave"`
+
+	fsutil.InodeSimpleAttributes // the only embedded field without a nosave tag; set by newNetTunDevice
+}
+
+var _ fs.InodeOperations = (*netTunInodeOperations)(nil) // compile-time check that the interface is implemented
+
+func newNetTunDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *netTunInodeOperations {
+	return &netTunInodeOperations{ // only the simple attributes are initialised explicitly
+		InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC),
+	}
+}
+
+// GetFile implements fs.InodeOperations.GetFile. Every call returns a file with fresh netTunFileOperations.
+func (iops *netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return fs.NewFile(ctx, d, flags, &netTunFileOperations{}), nil
+}
+
+// +stateify savable
+type netTunFileOperations struct { // file operations of one open of the tun device node
+	fsutil.FileNoSeek               `state:"nosave"`
+	fsutil.FileNoMMap               `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoopFlush            `state:"nosave"`
+	fsutil.FileNoopFsync            `state:"nosave"`
+	fsutil.FileNotDirReaddir        `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+	device tun.Device // zero value until the TUNSETIFF ioctl calls SetIff on it
+}
+
+var _ fs.FileOperations = (*netTunFileOperations)(nil) // compile-time check that the interface is implemented
+
+// Release implements fs.FileOperations.Release by releasing the underlying tun.Device.
+func (fops *netTunFileOperations) Release() {
+	fops.device.Release()
+}
+
+// Ioctl implements fs.FileOperations.Ioctl. Only TUNSETIFF and TUNGETIFF are handled; other requests get ENOTTY.
+func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	request := args[1].Uint() // ioctl request number
+	data := args[2].Pointer() // user address the linux.IFReq is copied from or to
+
+	switch request {
+	case linux.TUNSETIFF:
+		t := kernel.TaskFromContext(ctx)
+		if t == nil {
+			panic("Ioctl should be called from a task context")
+		}
+		if !t.HasCapability(linux.CAP_NET_ADMIN) {
+			return 0, syserror.EPERM
+		}
+		stack, ok := t.NetworkContext().(*netstack.Stack)
+		if !ok {
+			return 0, syserror.EINVAL
+		}
+
+		var req linux.IFReq
+		if _, err := usermem.CopyObjectIn(ctx, io, data, &req, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		flags := usermem.ByteOrder.Uint16(req.Data[:]) // the flags are the first two bytes of req.Data
+		return 0, fops.device.SetIff(stack.Stack, req.Name(), flags)
+
+	case linux.TUNGETIFF:
+		var req linux.IFReq
+
+		copy(req.IFName[:], fops.device.Name())
+
+		// Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when
+		// there is no sk_filter. See __tun_chr_ioctl() in drivers/net/tun.c.
+		flags := fops.device.Flags() | linux.IFF_NOFILTER
+		usermem.ByteOrder.PutUint16(req.Data[:], flags)
+
+		_, err := usermem.CopyObjectOut(ctx, io, data, &req, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// Write implements fs.FileOperations.Write. All of src is handed to the device in one call; offset is unused.
+func (fops *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	data := make([]byte, src.NumBytes()) // buffer sized to hold the whole source sequence
+	if _, err := src.CopyIn(ctx, data); err != nil {
+		return 0, err
+	}
+	return fops.device.Write(data)
+}
+
+// Read implements fs.FileOperations.Read. It copies out one packet from the device; offset is unused.
+func (fops *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	data, err := fops.device.Read()
+	if err != nil {
+		return 0, err
+	}
+	n, err := dst.CopyOut(ctx, data)
+	if n > 0 && n < len(data) {
+		// A partial copy is not an error: the packet is truncated to what was copied.
+		err = nil
+	}
+	return int64(n), err
+}
+
+// Readiness implements waiter.Waitable.Readiness by delegating to the device.
+func (fops *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fops.device.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister by delegating to the device.
+func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fops.device.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister by delegating to the device.
+func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) {
+	fops.device.EventUnregister(e)
+}
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go
index 2269f6237..4b5a0fca6 100644
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@@ -29,6 +29,7 @@ var (
 	EACCES       = error(syscall.EACCES)
 	EAGAIN       = error(syscall.EAGAIN)
 	EBADF        = error(syscall.EBADF)
+	EBADFD       = error(syscall.EBADFD)
 	EBUSY        = error(syscall.EBUSY)
 	ECHILD       = error(syscall.ECHILD)
 	ECONNREFUSED = error(syscall.ECONNREFUSED)
diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go
index 150310c11..17e94c562 100644
--- a/pkg/tcpip/buffer/view.go
+++ b/pkg/tcpip/buffer/view.go
@@ -156,3 +156,9 @@ func (vv *VectorisedView) Append(vv2 VectorisedView) {
 	vv.views = append(vv.views, vv2.views...)
 	vv.size += vv2.size
 }
+
+// AppendView appends v as the last view of vv and grows vv's size by len(v).
+func (vv *VectorisedView) AppendView(v View) {
+	vv.views = append(vv.views, v)
+	vv.size += len(v)
+}
diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD
index 3974c464e..b8b93e78e 100644
--- a/pkg/tcpip/link/channel/BUILD
+++ b/pkg/tcpip/link/channel/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = ["channel.go"],
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/stack",
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 78d447acd..5944ba190 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -20,6 +20,7 @@ package channel
 import (
 	"context"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -33,6 +34,118 @@ type PacketInfo struct {
 	Route stack.Route
 }
 
+// Notification is the interface for receiving notifications from the packet
+// queue.
+type Notification interface {
+	// WriteNotify is called after a packet has been written to the queue, with the queue lock released.
+	WriteNotify()
+}
+
+// NotificationHandle is an opaque handle to a registered notification target.
+// It can be used to unregister the notification when no longer interested.
+//
+// +stateify savable
+type NotificationHandle struct {
+	n Notification // target whose WriteNotify is called by queue.Write
+}
+
+type queue struct {
+	// mu protects fields below.
+	mu sync.RWMutex
+	// c is the outbound packet channel. Sending to c should hold mu.
+	c        chan PacketInfo
+	numWrite int                   // number of packets successfully sent to c
+	numRead  int                   // number of receives from c that have been counted
+	notify   []*NotificationHandle // targets notified by Write after a successful send
+}
+
+func (q *queue) Close() {
+	close(q.c) // NOTE(review): done without holding mu; a later Write would send on a closed channel and panic
+}
+
+// Read does a non-blocking read of one packet from the queue. It returns
+// false if no packet is queued, or if the queue was closed and has been
+// drained.
+func (q *queue) Read() (PacketInfo, bool) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	select {
+	case p, ok := <-q.c:
+		if !ok {
+			// c is closed and drained. A receive on a closed channel yields
+			// the zero value; counting it would make numRead exceed numWrite
+			// and make Num panic.
+			return PacketInfo{}, false
+		}
+		q.numRead++
+		return p, true
+	default:
+		return PacketInfo{}, false
+	}
+}
+
+// ReadContext does a blocking read of one packet from the queue. It returns
+// false if ctx is done first, or if the queue was closed and has been
+// drained.
+func (q *queue) ReadContext(ctx context.Context) (PacketInfo, bool) {
+	// We have to receive from channel without holding the lock, since it can
+	// block indefinitely. This will cause a window that numWrite - numRead
+	// produces a larger number, but won't go to negative. numWrite >= numRead
+	// still holds.
+	select {
+	case pkt, ok := <-q.c:
+		if !ok {
+			// c is closed and drained: the zero value received here is not a
+			// packet, so it must not be counted in numRead.
+			return PacketInfo{}, false
+		}
+		q.mu.Lock()
+		defer q.mu.Unlock()
+		q.numRead++
+		return pkt, true
+	case <-ctx.Done():
+		return PacketInfo{}, false
+	}
+}
+
+func (q *queue) Write(p PacketInfo) bool {
+	wrote := false
+
+	// It's important to make sure nobody can see numWrite until we increment it,
+	// so numWrite >= numRead holds. The send and the increment share one critical section.
+	q.mu.Lock()
+	select {
+	case q.c <- p:
+		wrote = true
+		q.numWrite++
+	default: // c is full: the packet is dropped and nobody is notified
+	}
+	notify := q.notify // snapshot taken under mu; RemoveNotify installs a new slice instead of editing this one
+	q.mu.Unlock()
+
+	if wrote {
+		// Send notification outside of lock.
+		for _, h := range notify {
+			h.n.WriteNotify()
+		}
+	}
+	return wrote
+}
+
+func (q *queue) Num() int {
+	q.mu.RLock()
+	defer q.mu.RUnlock()
+	n := q.numWrite - q.numRead // packets written but not yet counted as read
+	if n < 0 {
+		panic("numWrite < numRead")
+	}
+	return n
+}
+
+func (q *queue) AddNotify(notify Notification) *NotificationHandle {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	h := &NotificationHandle{n: notify} // a new handle per call, so each registration can be removed on its own
+	q.notify = append(q.notify, h)
+	return h
+}
+
+func (q *queue) RemoveNotify(handle *NotificationHandle) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	// Make a copy, since Write reads the slice outside of the lock when notifying.
+	notify := make([]*NotificationHandle, 0, len(q.notify))
+	for _, h := range q.notify {
+		if h != handle {
+			notify = append(notify, h)
+		}
+	}
+	q.notify = notify
+}
+
 // Endpoint is link layer endpoint that stores outbound packets in a channel
 // and allows injection of inbound packets.
 type Endpoint struct {
@@ -41,14 +154,16 @@ type Endpoint struct {
 	linkAddr           tcpip.LinkAddress
 	LinkEPCapabilities stack.LinkEndpointCapabilities
 
-	// c is where outbound packets are queued.
-	c chan PacketInfo
+	// Outbound packet queue.
+	q *queue
 }
 
 // New creates a new channel endpoint.
 func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
 	return &Endpoint{
-		c:        make(chan PacketInfo, size),
+		q: &queue{
+			c: make(chan PacketInfo, size),
+		},
 		mtu:      mtu,
 		linkAddr: linkAddr,
 	}
@@ -57,43 +172,36 @@ func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
 // Close closes e. Further packet injections will panic. Reads continue to
 // succeed until all packets are read.
 func (e *Endpoint) Close() {
-	close(e.c)
+	e.q.Close()
 }
 
-// Read does non-blocking read for one packet from the outbound packet queue.
+// Read does a non-blocking read of one packet from the outbound packet queue.
 func (e *Endpoint) Read() (PacketInfo, bool) {
-	select {
-	case pkt := <-e.c:
-		return pkt, true
-	default:
-		return PacketInfo{}, false
-	}
+	return e.q.Read()
 }
 
 // ReadContext does blocking read for one packet from the outbound packet queue.
 // It can be cancelled by ctx, and in this case, it returns false.
 func (e *Endpoint) ReadContext(ctx context.Context) (PacketInfo, bool) {
-	select {
-	case pkt := <-e.c:
-		return pkt, true
-	case <-ctx.Done():
-		return PacketInfo{}, false
-	}
+	return e.q.ReadContext(ctx)
 }
 
 // Drain removes all outbound packets from the channel and counts them.
 func (e *Endpoint) Drain() int {
 	c := 0
 	for {
-		select {
-		case <-e.c:
-			c++
-		default:
+		if _, ok := e.Read(); !ok {
 			return c
 		}
+		c++
 	}
 }
 
+// NumQueued returns the number of packets queued for outbound.
+func (e *Endpoint) NumQueued() int {
+	return e.q.Num()
+}
+
 // InjectInbound injects an inbound packet.
 func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
 	e.InjectLinkAddr(protocol, "", pkt)
@@ -155,10 +263,7 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 		Route: route,
 	}
 
-	select {
-	case e.c <- p:
-	default:
-	}
+	e.q.Write(p)
 
 	return nil
 }
@@ -171,7 +276,6 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
 	route.Release()
 	payloadView := pkts[0].Data.ToView()
 	n := 0
-packetLoop:
 	for _, pkt := range pkts {
 		off := pkt.DataOffset
 		size := pkt.DataSize
@@ -185,12 +289,10 @@ packetLoop:
 			Route: route,
 		}
 
-		select {
-		case e.c <- p:
-			n++
-		default:
-			break packetLoop
+		if !e.q.Write(p) {
+			break
 		}
+		n++
 	}
 
 	return n, nil
@@ -204,13 +306,21 @@ func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 		GSO:   nil,
 	}
 
-	select {
-	case e.c <- p:
-	default:
-	}
+	e.q.Write(p)
 
 	return nil
 }
 
 // Wait implements stack.LinkEndpoint.Wait.
 func (*Endpoint) Wait() {}
+
+// AddNotify adds a notification target for receiving events about outgoing
+// packets. The returned handle is what RemoveNotify takes to unregister it.
+func (e *Endpoint) AddNotify(notify Notification) *NotificationHandle {
+	return e.q.AddNotify(notify)
+}
+
+// RemoveNotify removes handle from the list of notification targets.
+func (e *Endpoint) RemoveNotify(handle *NotificationHandle) {
+	e.q.RemoveNotify(handle)
+}
diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD
index e5096ea38..e0db6cf54 100644
--- a/pkg/tcpip/link/tun/BUILD
+++ b/pkg/tcpip/link/tun/BUILD
@@ -4,6 +4,22 @@ package(licenses = ["notice"])
 
 go_library(
     name = "tun",
-    srcs = ["tun_unsafe.go"],
+    srcs = [
+        "device.go",
+        "protocol.go",
+        "tun_unsafe.go",
+    ],
     visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/refs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/stack",
+        "//pkg/waiter",
+    ],
 )
diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
new file mode 100644
index 000000000..6ff47a742
--- /dev/null
+++ b/pkg/tcpip/link/tun/device.go
@@ -0,0 +1,352 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// MTU given to the channel endpoint of a new device. See drivers/net/tun.c:tun_net_init().
+	defaultDevMtu = 1500
+
+	// Queue length for outbound packets, arriving at fd side for read. Overflow
+	// causes packet drops. gVisor implementation-specific.
+	defaultDevOutQueueLen = 1024
+)
+
+var zeroMAC [6]byte // all-zero link address; Write uses it as the remote address when there is no Ethernet header
+
+// Device is an opened /dev/net/tun device. endpoint stays nil until SetIff succeeds, and is reset by Release.
+//
+// +stateify savable
+type Device struct {
+	waiter.Queue
+
+	mu           sync.RWMutex `state:"nosave"`
+	endpoint     *tunEndpoint
+	notifyHandle *channel.NotificationHandle
+	flags        uint16
+}
+
+// beforeSave is invoked by stateify. It panics if d is still associated with an endpoint.
+func (d *Device) beforeSave() {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	// TODO(b/110961832): Restore the device to stack. At this moment, the stack
+	// is not savable.
+	if d.endpoint != nil {
+		panic("/dev/net/tun does not support save/restore when a device is associated with it.")
+	}
+}
+
+// Release detaches d from its endpoint, if any. It backs the Release of the file operations owning d.
+func (d *Device) Release() {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	// Unregister d and drop its reference if there is an endpoint associated with this file.
+	if d.endpoint != nil {
+		d.endpoint.RemoveNotify(d.notifyHandle)
+		d.endpoint.DecRef()
+		d.endpoint = nil
+	}
+}
+
+// SetIff services TUNSETIFF ioctl(2) request. Every failure is reported as EINVAL.
+func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	if d.endpoint != nil { // already attached: SetIff succeeds at most once per Device
+		return syserror.EINVAL
+	}
+
+	// Input validations: exactly one of IFF_TUN and IFF_TAP, and no flag outside supportedFlags.
+	isTun := flags&linux.IFF_TUN != 0
+	isTap := flags&linux.IFF_TAP != 0
+	supportedFlags := uint16(linux.IFF_TUN | linux.IFF_TAP | linux.IFF_NO_PI)
+	if isTap && isTun || !isTap && !isTun || flags&^supportedFlags != 0 {
+		return syserror.EINVAL
+	}
+
+	prefix := "tun" // prefix of the generated NIC name when name is empty
+	if isTap {
+		prefix = "tap"
+	}
+
+	endpoint, err := attachOrCreateNIC(s, name, prefix)
+	if err != nil {
+		return syserror.EINVAL // the specific error from attachOrCreateNIC is not propagated
+	}
+
+	d.endpoint = endpoint
+	d.notifyHandle = d.endpoint.AddNotify(d)
+	d.flags = flags
+	return nil
+}
+
+func attachOrCreateNIC(s *stack.Stack, name, prefix string) (*tunEndpoint, error) {
+	for { // retried whenever one of the races below is detected
+		// 1. Try to attach to an existing NIC with the requested name.
+		if name != "" {
+			if nic, found := s.GetNICByName(name); found {
+				endpoint, ok := nic.LinkEndpoint().(*tunEndpoint)
+				if !ok {
+					// Not a NIC created by tun device.
+					return nil, syserror.EOPNOTSUPP
+				}
+				if !endpoint.TryIncRef() {
+					// Race detected: NIC got deleted in between.
+					continue
+				}
+				return endpoint, nil
+			}
+		}
+
+		// 2. Create a new NIC, named prefix followed by its ID when no name was requested.
+		id := tcpip.NICID(s.UniqueID())
+		endpoint := &tunEndpoint{
+			Endpoint: channel.New(defaultDevOutQueueLen, defaultDevMtu, ""),
+			stack:    s,
+			nicID:    id,
+			name:     name,
+		}
+		if endpoint.name == "" {
+			endpoint.name = fmt.Sprintf("%s%d", prefix, id)
+		}
+		err := s.CreateNICWithOptions(endpoint.nicID, endpoint, stack.NICOptions{
+			Name: endpoint.name,
+		})
+		switch err {
+		case nil:
+			return endpoint, nil
+		case tcpip.ErrDuplicateNICID:
+			// Race detected: A NIC has been created in between.
+			continue
+		default:
+			return nil, syserror.EINVAL
+		}
+	}
+}
+
+// Write injects one inbound packet into the network interface and returns len(data) unless d is unusable.
+func (d *Device) Write(data []byte) (int64, error) {
+	d.mu.RLock()
+	endpoint := d.endpoint
+	d.mu.RUnlock()
+	if endpoint == nil {
+		return 0, syserror.EBADFD
+	}
+	if !endpoint.IsAttached() {
+		return 0, syserror.EIO
+	}
+
+	dataLen := int64(len(data)) // captured before the headers are sliced off data
+
+	// Packet information.
+	var pktInfoHdr PacketInfoHeader
+	if !d.hasFlags(linux.IFF_NO_PI) {
+		if len(data) < PacketInfoHeaderSize {
+			// Ignore bad packet, but report it as fully written.
+			return dataLen, nil
+		}
+		pktInfoHdr = PacketInfoHeader(data[:PacketInfoHeaderSize])
+		data = data[PacketInfoHeaderSize:]
+	}
+
+	// Ethernet header (TAP only).
+	var ethHdr header.Ethernet
+	if d.hasFlags(linux.IFF_TAP) {
+		if len(data) < header.EthernetMinimumSize {
+			// Ignore bad packet, but report it as fully written.
+			return dataLen, nil
+		}
+		ethHdr = header.Ethernet(data[:header.EthernetMinimumSize])
+		data = data[header.EthernetMinimumSize:]
+	}
+
+	// Try to determine network protocol number, default zero. The packet information header wins.
+	var protocol tcpip.NetworkProtocolNumber
+	switch {
+	case pktInfoHdr != nil:
+		protocol = pktInfoHdr.Protocol()
+	case ethHdr != nil:
+		protocol = ethHdr.Type()
+	}
+
+	// Try to determine remote link address, default zero.
+	var remote tcpip.LinkAddress
+	switch {
+	case ethHdr != nil:
+		remote = ethHdr.SourceAddress()
+	default:
+		remote = tcpip.LinkAddress(zeroMAC[:])
+	}
+
+	pkt := tcpip.PacketBuffer{
+		Data: buffer.View(data).ToVectorisedView(),
+	}
+	if ethHdr != nil {
+		pkt.LinkHeader = buffer.View(ethHdr)
+	}
+	endpoint.InjectLinkAddr(protocol, remote, pkt)
+	return dataLen, nil
+}
+
+// Read returns one outgoing packet from the network interface, encoded for the fd side. It does not block.
+func (d *Device) Read() ([]byte, error) {
+	d.mu.RLock()
+	endpoint := d.endpoint
+	d.mu.RUnlock()
+	if endpoint == nil {
+		return nil, syserror.EBADFD
+	}
+
+	for {
+		info, ok := endpoint.Read()
+		if !ok {
+			return nil, syserror.ErrWouldBlock
+		}
+
+		v, ok := d.encodePkt(&info)
+		if !ok {
+			// Ignore unsupported packet and try the next queued one.
+			continue
+		}
+		return v, nil
+	}
+}
+
+// encodePkt encodes an outbound packet for the fd side. It returns false if the packet is unsupported.
+func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) {
+	var vv buffer.VectorisedView
+
+	// Packet information.
+	if !d.hasFlags(linux.IFF_NO_PI) {
+		hdr := make(PacketInfoHeader, PacketInfoHeaderSize)
+		hdr.Encode(&PacketInfoFields{
+			Protocol: info.Proto,
+		})
+		vv.AppendView(buffer.View(hdr))
+	}
+
+	// If the packet does not already have link layer header, and the route
+	// does not exist, we can't compute it. This is possibly a raw packet, tun
+	// device doesn't support this at the moment.
+	if info.Pkt.LinkHeader == nil && info.Route.RemoteLinkAddress == "" {
+		return nil, false
+	}
+
+	// Ethernet header (TAP only).
+	if d.hasFlags(linux.IFF_TAP) {
+		// Add ethernet header if not provided.
+		if info.Pkt.LinkHeader == nil {
+			hdr := &header.EthernetFields{
+				SrcAddr: info.Route.LocalLinkAddress,
+				DstAddr: info.Route.RemoteLinkAddress,
+				Type:    info.Proto,
+			}
+			if hdr.SrcAddr == "" {
+				hdr.SrcAddr = d.endpoint.LinkAddress() // NOTE(review): reads d.endpoint without taking d.mu
+			}
+
+			eth := make(header.Ethernet, header.EthernetMinimumSize)
+			eth.Encode(hdr)
+			vv.AppendView(buffer.View(eth))
+		} else {
+			vv.AppendView(info.Pkt.LinkHeader)
+		}
+	}
+
+	// Append upper headers, skipping the first len(LinkHeader) bytes of the header view.
+	vv.AppendView(buffer.View(info.Pkt.Header.View()[len(info.Pkt.LinkHeader):]))
+	// Append data payload.
+	vv.Append(info.Pkt.Data)
+
+	return vv.ToView(), true
+}
+
+// Name reports the name of the NIC that d is attached to, or "" when d has
+// not been attached to one.
+func (d *Device) Name() string {
+	d.mu.RLock()
+	defer d.mu.RUnlock()
+	if ep := d.endpoint; ep != nil {
+		return ep.name
+	}
+	return ""
+}
+
+// Flags returns the flags stored by SetIff for d, read under d.mu. Zero value if unset.
+func (d *Device) Flags() uint16 {
+	d.mu.RLock()
+	defer d.mu.RUnlock()
+	return d.flags
+}
+
+func (d *Device) hasFlags(flags uint16) bool {
+	return d.flags&flags == flags // true only if every bit of flags is set; d.flags is read without taking d.mu
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (d *Device) Readiness(mask waiter.EventMask) waiter.EventMask {
+	if mask&waiter.EventIn != 0 {
+		d.mu.RLock()
+		endpoint := d.endpoint
+		d.mu.RUnlock()
+		if endpoint != nil && endpoint.NumQueued() == 0 {
+			mask &= ^waiter.EventIn // attached, but no packet is queued for reading
+		}
+	}
+	return mask & (waiter.EventIn | waiter.EventOut) // no event other than EventIn and EventOut is reported
+}
+
+// WriteNotify implements channel.Notification.WriteNotify by notifying d's waiters of EventIn.
+func (d *Device) WriteNotify() {
+	d.Notify(waiter.EventIn)
+}
+
+// tunEndpoint is the link endpoint for the NIC created by the tun device.
+//
+// It is ref-counted as multiple open files can attach to the same NIC.
+// The last owner is responsible for deleting the NIC; see DecRef.
+type tunEndpoint struct {
+	*channel.Endpoint
+
+	refs.AtomicRefCount
+
+	stack *stack.Stack
+	nicID tcpip.NICID
+	name  string
+}
+
+// DecRef decrements refcount of e, removes NIC if refcount goes to 0.
+func (e *tunEndpoint) DecRef() {
+	e.DecRefWithDestructor(func() {
+		e.stack.RemoveNIC(e.nicID) // the error returned by RemoveNIC is ignored here
+	})
+}
diff --git a/pkg/tcpip/link/tun/protocol.go b/pkg/tcpip/link/tun/protocol.go
new file mode 100644
index 000000000..89d9d91a9
--- /dev/null
+++ b/pkg/tcpip/link/tun/protocol.go
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	// PacketInfoHeaderSize is the size in bytes of the packet information header.
+	PacketInfoHeaderSize = 4
+
+	offsetFlags    = 0 // byte offset of the 16-bit flags field
+	offsetProtocol = 2 // byte offset of the 16-bit protocol field
+)
+
+// PacketInfoFields contains the fields sent over the wire if the IFF_NO_PI
+// flag is not set. PacketInfoHeader.Encode serialises it.
+type PacketInfoFields struct {
+	Flags    uint16
+	Protocol tcpip.NetworkProtocolNumber
+}
+
+// PacketInfoHeader is the wire representation of the packet information sent if
+// the IFF_NO_PI flag is not set. It holds PacketInfoHeaderSize bytes.
+type PacketInfoHeader []byte
+
+// Encode writes the fields of f into h, each as a big-endian 16-bit value.
+func (h PacketInfoHeader) Encode(f *PacketInfoFields) {
+	binary.BigEndian.PutUint16(h[offsetFlags:][:2], f.Flags)
+	binary.BigEndian.PutUint16(h[offsetProtocol:][:2], uint16(f.Protocol))
+}
+
+// Flags returns the flags field in h, decoded as a big-endian 16-bit value.
+func (h PacketInfoHeader) Flags() uint16 {
+	return binary.BigEndian.Uint16(h[offsetFlags:])
+}
+
+// Protocol returns the protocol field in h, decoded as a big-endian 16-bit value.
+func (h PacketInfoHeader) Protocol() tcpip.NetworkProtocolNumber {
+	return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(h[offsetProtocol:]))
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 862954ab2..46d3a6646 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -298,6 +298,33 @@ func (n *NIC) enable() *tcpip.Error {
 	return nil
 }
 
+// remove cuts n off from traffic in both directions: it detaches n from its
+// link endpoint and removes every permanent and permanentTentative address.
+//
+// Every removal is attempted even if an earlier one fails; the first error
+// encountered, if any, is returned.
+func (n *NIC) remove() *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	// Detach from the link endpoint first, so that no packet comes in.
+	n.linkEP.Attach(nil)
+
+	// Then drop the addresses, so that no packet goes out.
+	var firstErr *tcpip.Error
+	for nid, ref := range n.mu.endpoints {
+		kind := ref.getKind()
+		if kind != permanent && kind != permanentTentative {
+			continue
+		}
+		err := n.removePermanentAddressLocked(nid.LocalAddress)
+		if err != nil && firstErr == nil {
+			firstErr = err
+		}
+	}
+	return firstErr
+}
+
 // becomeIPv6Router transitions n into an IPv6 router.
 //
 // When transitioning into an IPv6 router, host-only state (NDP discovered
@@ -1302,6 +1329,11 @@ func (n *NIC) Stack() *Stack {
 	return n.stack
 }
 
+// LinkEndpoint returns n.linkEP, the link endpoint of n. No lock is taken.
+func (n *NIC) LinkEndpoint() LinkEndpoint {
+	return n.linkEP
+}
+
 // isAddrTentative returns true if addr is tentative on n.
 //
 // Note that if addr is not associated with n, then this function will return
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index f0ed76fbe..900dd46c5 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -916,6 +916,18 @@ func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
 	return s.CreateNICWithOptions(id, ep, NICOptions{})
 }
 
+// GetNICByName looks up a NIC by name. It returns the NIC and true when a NIC
+// with that name is registered on the stack, and nil and false otherwise.
+func (s *Stack) GetNICByName(name string) (*NIC, bool) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	for _, candidate := range s.nics {
+		if candidate.Name() != name {
+			continue
+		}
+		return candidate, true
+	}
+	return nil, false
+}
+
 // EnableNIC enables the given NIC so that the link-layer endpoint can start
 // delivering packets to it.
 func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error {
@@ -956,6 +968,33 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	return nic.enabled()
 }
 
+// RemoveNIC removes the NIC identified by id from the stack, together with
+// every route in the route table that refers to it. It returns
+// tcpip.ErrUnknownNICID if no NIC with that id exists.
+func (s *Stack) RemoveNIC(id tcpip.NICID) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	nic, found := s.nics[id]
+	if !found {
+		return tcpip.ErrUnknownNICID
+	}
+	delete(s.nics, id)
+
+	// Filter the route table in place: surviving routes are compacted to the
+	// front of the existing backing array, keeping their relative order.
+	kept := s.routeTable[:0]
+	for _, route := range s.routeTable {
+		if route.NIC == id {
+			continue
+		}
+		kept = append(kept, route)
+	}
+	s.routeTable = kept
+
+	return nic.remove()
+}
+
 // NICAddressRanges returns a map of NICIDs to their associated subnets.
 func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
 	s.mu.RLock()
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index d1977d4de..3518e862d 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -678,6 +678,8 @@ syscall_test(
     test = "//test/syscalls/linux:truncate_test",
 )
 
+syscall_test(test = "//test/syscalls/linux:tuntap_test")
+
 syscall_test(test = "//test/syscalls/linux:udp_bind_test")
 
 syscall_test(
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index aa303af84..704bae17b 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -131,6 +131,17 @@ cc_library(
     ],
 )
 
+# Test-only helper library built from socket_netlink_route_util.{cc,h}; it is
+# layered on top of :socket_netlink_util.
+cc_library(
+    name = "socket_netlink_route_util",
+    testonly = 1,
+    srcs = ["socket_netlink_route_util.cc"],
+    hdrs = ["socket_netlink_route_util.h"],
+    deps = [
+        ":socket_netlink_util",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
 cc_library(
     name = "socket_test_util",
     testonly = 1,
@@ -3430,6 +3441,25 @@ cc_binary(
     ],
 )
 
+# Statically linked syscall test binary built from tuntap.cc.
+cc_binary(
+    name = "tuntap_test",
+    testonly = 1,
+    srcs = ["tuntap.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        gtest,
+        "//test/syscalls/linux:socket_netlink_route_util",
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:posix_error",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "udp_socket_test_cases",
     testonly = 1,
diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc
index 4dd302eed..4e473268c 100644
--- a/test/syscalls/linux/dev.cc
+++ b/test/syscalls/linux/dev.cc
@@ -153,6 +153,13 @@ TEST(DevTest, TTYExists) {
   EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
 }
 
+TEST(DevTest, NetTunExists) {
+  // /dev/net/tun must be present as a character device with mode rw-rw-rw-
+  // (0666).
+  struct stat st = {};
+  ASSERT_THAT(stat("/dev/net/tun", &st), SyscallSucceeds());
+  EXPECT_EQ(st.st_mode, S_IFCHR | 0666);
+}
+
 }  // namespace
 }  // namespace testing
 
diff --git a/test/syscalls/linux/socket_netlink_route_util.cc b/test/syscalls/linux/socket_netlink_route_util.cc
new file mode 100644
index 000000000..53eb3b6b2
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_route_util.cc
@@ -0,0 +1,163 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+
+#include <errno.h>
+#include <linux/if.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <string.h>
+
+#include "absl/types/optional.h"
+#include "test/syscalls/linux/socket_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr uint32_t kSeq = 12345;
+
+}  // namespace
+
+// Sends an RTM_GETLINK dump request (family AF_UNSPEC) on fd using sequence
+// number seq, and hands fn to NetlinkRequestResponse to process the replies.
+PosixError DumpLinks(
+    const FileDescriptor& fd, uint32_t seq,
+    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+  // Wire format of the request: netlink header followed by the link-info
+  // payload.
+  struct {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifm;
+  } msg = {};
+
+  msg.ifm.ifi_family = AF_UNSPEC;
+  msg.hdr.nlmsg_seq = seq;
+  msg.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+  msg.hdr.nlmsg_type = RTM_GETLINK;
+  msg.hdr.nlmsg_len = sizeof(msg);
+
+  return NetlinkRequestResponse(fd, &msg, sizeof(msg), fn, false);
+}
+
+// Returns every named link reported by an RTM_GETLINK dump issued on a fresh
+// NETLINK_ROUTE socket. Messages that are not RTM_NEWLINK, are too short to
+// carry an ifinfomsg, or lack an IFLA_IFNAME attribute are ignored.
+PosixErrorOr<std::vector<Link>> DumpLinks() {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  std::vector<Link> links;
+  RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+    if (hdr->nlmsg_type != RTM_NEWLINK ||
+        hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
+      return;
+    }
+    const struct ifinfomsg* msg =
+        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+    const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+    if (rta == nullptr) {
+      // Ignore links that do not have a name.
+      return;
+    }
+
+    // Do not trust the sender to NUL-terminate the name: read at most the
+    // attribute's payload, stopping early at the first NUL byte.
+    const size_t payload_len =
+        rta->rta_len > RTA_LENGTH(0) ? rta->rta_len - RTA_LENGTH(0) : 0;
+    const char* name = reinterpret_cast<const char*>(RTA_DATA(rta));
+    const size_t name_len = strnlen(name, payload_len);
+
+    links.emplace_back();
+    Link& link = links.back();
+    link.index = msg->ifi_index;
+    link.type = msg->ifi_type;
+    link.name = std::string(name, name_len);
+  }));
+  return links;
+}
+
+// Scans the links returned by DumpLinks() and returns the first one whose type
+// is ARPHRD_LOOPBACK, or an empty optional when there is none.
+PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  absl::optional<Link> loopback;
+  for (const auto& candidate : links) {
+    if (candidate.type == ARPHRD_LOOPBACK) {
+      loopback = candidate;
+      break;
+    }
+  }
+  return loopback;
+}
+
+// Sends an RTM_NEWADDR request that sets addr (addrlen bytes, address family
+// `family`, prefix length `prefixlen`) as the IFA_LOCAL attribute of the
+// interface with the given index, and waits for the ack.
+//
+// Returns EINVAL without sending anything if addrlen is negative or the
+// attribute would not fit in the request buffer.
+PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
+                            const void* addr, int addrlen) {
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifaddrmsg ifaddr;
+    char attrbuf[512];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifaddr));
+  req.hdr.nlmsg_type = RTM_NEWADDR;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifaddr.ifa_index = index;
+  req.ifaddr.ifa_family = family;
+  req.ifaddr.ifa_prefixlen = prefixlen;
+
+  // The attribute is appended after the aligned fixed-size part of the
+  // message; reject lengths that would make the copy below run past req.
+  const size_t attr_offset = NLMSG_ALIGN(req.hdr.nlmsg_len);
+  if (addrlen < 0 || attr_offset + RTA_LENGTH(addrlen) > sizeof(req)) {
+    return PosixError(EINVAL, "address length does not fit in request");
+  }
+
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct rtattr* rta = reinterpret_cast<struct rtattr*>(
+      reinterpret_cast<char*>(&req) + attr_offset);
+  rta->rta_type = IFA_LOCAL;
+  rta->rta_len = RTA_LENGTH(addrlen);
+  req.hdr.nlmsg_len = attr_offset + RTA_LENGTH(addrlen);
+  memcpy(RTA_DATA(rta), addr, addrlen);
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+// Sends an RTM_NEWLINK request for the interface with the given index that
+// carries `flags` in ifi_flags and `change` in ifi_change, then waits for the
+// ack.
+PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change) {
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifinfo;
+    char pad[NLMSG_ALIGNTO];
+  } msg = {};
+
+  msg.ifinfo.ifi_index = index;
+  msg.ifinfo.ifi_flags = flags;
+  msg.ifinfo.ifi_change = change;
+
+  msg.hdr.nlmsg_seq = kSeq;
+  msg.hdr.nlmsg_type = RTM_NEWLINK;
+  msg.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  msg.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(msg.ifinfo));
+
+  return NetlinkRequestAckOrError(fd, kSeq, &msg, msg.hdr.nlmsg_len);
+}
+
+// Sends an RTM_NEWLINK request that sets addr (addrlen bytes) as the
+// IFLA_ADDRESS attribute of the interface with the given index, and waits for
+// the ack.
+//
+// Returns EINVAL without sending anything if addrlen is negative or the
+// attribute would not fit in the request buffer.
+PosixError LinkSetMacAddr(int index, const void* addr, int addrlen) {
+  struct request {
+    struct nlmsghdr hdr;
+    struct ifinfomsg ifinfo;
+    char attrbuf[512];
+  };
+
+  struct request req = {};
+  req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifinfo));
+  req.hdr.nlmsg_type = RTM_NEWLINK;
+  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+  req.hdr.nlmsg_seq = kSeq;
+  req.ifinfo.ifi_index = index;
+
+  // The attribute is appended after the aligned fixed-size part of the
+  // message; reject lengths that would make the copy below run past req.
+  const size_t attr_offset = NLMSG_ALIGN(req.hdr.nlmsg_len);
+  if (addrlen < 0 || attr_offset + RTA_LENGTH(addrlen) > sizeof(req)) {
+    return PosixError(EINVAL, "address length does not fit in request");
+  }
+
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+  struct rtattr* rta = reinterpret_cast<struct rtattr*>(
+      reinterpret_cast<char*>(&req) + attr_offset);
+  rta->rta_type = IFLA_ADDRESS;
+  rta->rta_len = RTA_LENGTH(addrlen);
+  req.hdr.nlmsg_len = attr_offset + RTA_LENGTH(addrlen);
+  memcpy(RTA_DATA(rta), addr, addrlen);
+
+  return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_route_util.h b/test/syscalls/linux/socket_netlink_route_util.h
new file mode 100644
index 000000000..2c018e487
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_route_util.h
@@ -0,0 +1,55 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <vector>
+
+#include "absl/types/optional.h"
+#include "test/syscalls/linux/socket_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Link describes one network interface as collected by DumpLinks().
+struct Link {
+  // Interface index, taken from ifinfomsg::ifi_index.
+  int index;
+  // Device type, taken from ifinfomsg::ifi_type (e.g. ARPHRD_LOOPBACK).
+  int16_t type;
+  // Interface name, taken from the IFLA_IFNAME attribute.
+  std::string name;
+};
+
+// DumpLinks sends an RTM_GETLINK dump request on fd with sequence number seq;
+// fn is the callback used to process the reply messages.
+PosixError DumpLinks(const FileDescriptor& fd, uint32_t seq,
+                     const std::function<void(const struct nlmsghdr* hdr)>& fn);
+
+// DumpLinks returns the named links reported by an RTM_GETLINK dump on a new
+// NETLINK_ROUTE socket.
+PosixErrorOr<std::vector<Link>> DumpLinks();
+
+// FindLoopbackLink returns the first dumped link of type ARPHRD_LOOPBACK, or
+// an empty optional if there is none.
+PosixErrorOr<absl::optional<Link>> FindLoopbackLink();
+
+// LinkAddLocalAddr sets IFA_LOCAL attribute on the interface.
+PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
+                            const void* addr, int addrlen);
+
+// LinkChangeFlags changes interface flags. E.g. IFF_UP.
+PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change);
+
+// LinkSetMacAddr sets IFLA_ADDRESS attribute of the interface.
+PosixError LinkSetMacAddr(int index, const void* addr, int addrlen);
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
new file mode 100644
index 000000000..f6ac9d7b8
--- /dev/null
+++ b/test/syscalls/linux/tuntap.cc
@@ -0,0 +1,346 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <linux/capability.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_tun.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_split.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr int kIPLen = 4;
+
+constexpr const char kDevNetTun[] = "/dev/net/tun";
+constexpr const char kTapName[] = "tap0";
+
+constexpr const uint8_t kMacA[ETH_ALEN] = {0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA};
+constexpr const uint8_t kMacB[ETH_ALEN] = {0xBB, 0xBB, 0xBB, 0xBB, 0xBB, 0xBB};
+
+// Collects the names of all links returned by DumpLinks() into a set.
+PosixErrorOr<std::set<std::string>> DumpLinkNames() {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  std::set<std::string> result;
+  for (auto it = links.begin(); it != links.end(); ++it) {
+    result.insert(it->name);
+  }
+  return result;
+}
+
+// Returns the first link from DumpLinks() whose name equals `name`, or an
+// empty optional when no link has that name.
+PosixErrorOr<absl::optional<Link>> GetLinkByName(const std::string& name) {
+  ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+  absl::optional<Link> match;
+  for (const auto& candidate : links) {
+    if (candidate.name == name) {
+      match = candidate;
+      break;
+    }
+  }
+  return match;
+}
+
+// Packet information header that precedes every frame this test writes to or
+// reads from the TAP fd. pi_protocol is stored in network byte order (it is
+// filled with htons(ETH_P_*) below).
+struct pihdr {
+  uint16_t pi_flags;
+  uint16_t pi_protocol;
+} __attribute__((packed));
+
+// On-the-wire layout of an ICMP echo packet as exchanged over the TAP fd:
+// packet info header, Ethernet header, IPv4 header (no options), ICMP header
+// and a fixed 64-byte payload. Packed so the struct can be read/written as-is.
+struct ping_pkt {
+  pihdr pi;
+  struct ethhdr eth;
+  struct iphdr ip;
+  struct icmphdr icmp;
+  char payload[64];
+} __attribute__((packed));
+
+// Builds an ICMP echo request framed for a TAP fd: packet info header,
+// Ethernet header (srcmac -> dstmac), IPv4 header (srcip -> dstip, given as
+// dotted-quad strings) and an ICMP echo header followed by a 64-byte payload
+// starting with "abcd". IP and ICMP checksums are filled in.
+ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
+                          const uint8_t dstmac[ETH_ALEN], const char* dstip) {
+  ping_pkt pkt = {};
+
+  pkt.pi.pi_protocol = htons(ETH_P_IP);
+
+  memcpy(pkt.eth.h_dest, dstmac, sizeof(pkt.eth.h_dest));
+  memcpy(pkt.eth.h_source, srcmac, sizeof(pkt.eth.h_source));
+  pkt.eth.h_proto = htons(ETH_P_IP);
+
+  pkt.ip.ihl = 5;
+  pkt.ip.version = 4;
+  pkt.ip.tos = 0;
+  pkt.ip.tot_len = htons(sizeof(struct iphdr) + sizeof(struct icmphdr) +
+                         sizeof(pkt.payload));
+  pkt.ip.id = 1;
+  // Do not fragment. frag_off is a network-byte-order field, so convert the
+  // flag explicitly instead of hard-coding its little-endian in-memory value.
+  pkt.ip.frag_off = htons(IP_DF);
+  pkt.ip.ttl = 64;
+  pkt.ip.protocol = IPPROTO_ICMP;
+  inet_pton(AF_INET, dstip, &pkt.ip.daddr);
+  inet_pton(AF_INET, srcip, &pkt.ip.saddr);
+  // The header checksum must be computed last, over the completed IP header.
+  pkt.ip.check = IPChecksum(pkt.ip);
+
+  pkt.icmp.type = ICMP_ECHO;
+  pkt.icmp.code = 0;
+  pkt.icmp.checksum = 0;
+  pkt.icmp.un.echo.sequence = 1;
+  pkt.icmp.un.echo.id = 1;
+
+  // strncpy zero-fills the rest of the payload, so the checksum below covers
+  // deterministic bytes.
+  strncpy(pkt.payload, "abcd", sizeof(pkt.payload));
+  pkt.icmp.checksum = ICMPChecksum(pkt.icmp, pkt.payload, sizeof(pkt.payload));
+
+  return pkt;
+}
+
+// On-the-wire layout of an Ethernet/IPv4 ARP packet as exchanged over the TAP
+// fd: packet info header, Ethernet header, fixed ARP header, then the sender
+// and target hardware (ETH_ALEN bytes) and protocol (kIPLen bytes) addresses.
+struct arp_pkt {
+  pihdr pi;
+  struct ethhdr eth;
+  struct arphdr arp;
+  uint8_t arp_sha[ETH_ALEN];
+  uint8_t arp_spa[kIPLen];
+  uint8_t arp_tha[ETH_ALEN];
+  uint8_t arp_tpa[kIPLen];
+} __attribute__((packed));
+
+// Builds an ARP reply framed for a TAP fd and returns it as a byte string.
+//
+// The reply announces that srcip (dotted-quad string) is at srcmac and is
+// addressed to dstmac/dstip. Both the Ethernet header and the ARP body are
+// derived from the arguments.
+std::string CreateArpPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
+                            const uint8_t dstmac[ETH_ALEN], const char* dstip) {
+  std::string buffer;
+  // resize() zero-fills, so fields that are not set below (e.g. pi_flags)
+  // are 0.
+  buffer.resize(sizeof(arp_pkt));
+
+  arp_pkt* pkt = reinterpret_cast<arp_pkt*>(&buffer[0]);
+  {
+    pkt->pi.pi_protocol = htons(ETH_P_ARP);
+
+    // Use the caller-supplied addresses for the Ethernet header as well, so
+    // it always agrees with the ARP sender/target hardware addresses.
+    memcpy(pkt->eth.h_dest, dstmac, sizeof(pkt->eth.h_dest));
+    memcpy(pkt->eth.h_source, srcmac, sizeof(pkt->eth.h_source));
+    pkt->eth.h_proto = htons(ETH_P_ARP);
+
+    pkt->arp.ar_hrd = htons(ARPHRD_ETHER);
+    pkt->arp.ar_pro = htons(ETH_P_IP);
+    pkt->arp.ar_hln = ETH_ALEN;
+    pkt->arp.ar_pln = kIPLen;
+    pkt->arp.ar_op = htons(ARPOP_REPLY);
+
+    memcpy(pkt->arp_sha, srcmac, sizeof(pkt->arp_sha));
+    inet_pton(AF_INET, srcip, pkt->arp_spa);
+    memcpy(pkt->arp_tha, dstmac, sizeof(pkt->arp_tha));
+    inet_pton(AF_INET, dstip, pkt->arp_tpa);
+  }
+  return buffer;
+}
+
+}  // namespace
+
+// Fixture for the TUN/TAP tests. Test cases may drop CAP_NET_ADMIN; the
+// fixture restores it afterwards so that later test cases are not affected.
+class TuntapTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Record the state before the test body runs. Checking in TearDown() is
+    // too late: once a test has dropped the capability the check reports that
+    // it is absent, and it would never be restored.
+    had_net_admin_ = ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN));
+  }
+
+  void TearDown() override {
+    if (had_net_admin_) {
+      // Bring back capability if we had dropped it in test case.
+      ASSERT_NO_ERRNO(SetCapability(CAP_NET_ADMIN, true));
+    }
+  }
+
+ private:
+  // Whether CAP_NET_ADMIN was held when the test case started.
+  bool had_net_admin_ = false;
+};
+
+// With CAP_NET_ADMIN dropped, TUNSETIFF on /dev/net/tun is expected to fail
+// with EPERM.
+TEST_F(TuntapTest, CreateInterfaceNoCap) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  ASSERT_NO_ERRNO(SetCapability(CAP_NET_ADMIN, false));
+
+  FileDescriptor tun = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq req = {};
+  strncpy(req.ifr_name, kTapName, IFNAMSIZ);
+  req.ifr_flags = IFF_TAP;
+
+  EXPECT_THAT(ioctl(tun.get(), TUNSETIFF, &req), SyscallFailsWithErrno(EPERM));
+}
+
+// Creates a TAP interface with an explicit name and checks that it shows up
+// in the link dump and that TUNGETIFF reports the settings back.
+TEST_F(TuntapTest, CreateFixedNameInterface) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor tun = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq requested = {};
+  strncpy(requested.ifr_name, kTapName, IFNAMSIZ);
+  requested.ifr_flags = IFF_TAP;
+  EXPECT_THAT(ioctl(tun.get(), TUNSETIFF, &requested),
+              SyscallSucceedsWithValue(0));
+
+  struct ifreq actual = {};
+  EXPECT_THAT(ioctl(tun.get(), TUNGETIFF, &actual),
+              SyscallSucceedsWithValue(0));
+
+  // TUNGETIFF is expected to report what was set plus IFF_NOFILTER; see
+  // __tun_chr_ioctl() in drivers/net/tun.c.
+  struct ifreq expected = requested;
+  expected.ifr_flags |= IFF_NOFILTER;
+
+  EXPECT_THAT(DumpLinkNames(),
+              IsPosixErrorOkAndHolds(::testing::Contains(kTapName)));
+  EXPECT_THAT(memcmp(&expected, &actual, sizeof(actual)), ::testing::Eq(0));
+}
+
+// Creates a TAP interface without naming it and checks that the assigned name
+// starts with "tap" and appears in the link dump.
+TEST_F(TuntapTest, CreateInterface) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor tun = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  // ifr_name is left empty on purpose so that the kernel picks the name.
+  struct ifreq requested = {};
+  requested.ifr_flags = IFF_TAP;
+  EXPECT_THAT(ioctl(tun.get(), TUNSETIFF, &requested),
+              SyscallSucceedsWithValue(0));
+
+  struct ifreq actual = {};
+  EXPECT_THAT(ioctl(tun.get(), TUNGETIFF, &actual),
+              SyscallSucceedsWithValue(0));
+
+  const std::string assigned_name = actual.ifr_name;
+  EXPECT_THAT(assigned_name, ::testing::StartsWith("tap"));
+  EXPECT_THAT(DumpLinkNames(),
+              IsPosixErrorOkAndHolds(::testing::Contains(assigned_name)));
+}
+
+// An fd for /dev/net/tun on which TUNSETIFF has not been issued is expected
+// to reject both read and write with EBADFD.
+TEST_F(TuntapTest, InvalidReadWrite) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor tun = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  char scratch[128] = {};
+  EXPECT_THAT(read(tun.get(), scratch, sizeof(scratch)),
+              SyscallFailsWithErrno(EBADFD));
+  EXPECT_THAT(write(tun.get(), scratch, sizeof(scratch)),
+              SyscallFailsWithErrno(EBADFD));
+}
+
+// Writing to a freshly created TAP device that has not been brought up is
+// expected to fail with EIO.
+TEST_F(TuntapTest, WriteToDownDevice) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // FIXME: gVisor always creates enabled/up'd interfaces.
+  SKIP_IF(IsRunningOnGvisor());
+
+  FileDescriptor tun = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  // The new device should be down by default.
+  struct ifreq req = {};
+  req.ifr_flags = IFF_TAP;
+  EXPECT_THAT(ioctl(tun.get(), TUNSETIFF, &req), SyscallSucceedsWithValue(0));
+
+  char frame[128] = {};
+  EXPECT_THAT(write(tun.get(), frame, sizeof(frame)),
+              SyscallFailsWithErrno(EIO));
+}
+
+// This test sets up a TAP device and pings kernel by sending ICMP echo request.
+//
+// It works as the following:
+// * Open /dev/net/tun, and create kTapName interface.
+// * Use rtnetlink to do initial setup of the interface:
+//   * Assign IP address 10.0.0.1/24 to kernel.
+//   * MAC address: kMacA
+//   * Bring up the interface.
+// * Send an ICMP echo request (ping) packet from 10.0.0.2 (kMacB) to kernel.
+// * Loop to receive packets from TAP device/fd:
+//   * If packet is an ICMP echo reply, it stops and passes the test.
+//   * If packet is an ARP request, it responds with canned reply and resends
+//     the ICMP request packet.
+TEST_F(TuntapTest, PingKernel) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  // Interface creation.
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+  struct ifreq ifr_set = {};
+  ifr_set.ifr_flags = IFF_TAP;
+  strncpy(ifr_set.ifr_name, kTapName, IFNAMSIZ);
+  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr_set),
+              SyscallSucceedsWithValue(0));
+
+  absl::optional<Link> link =
+      ASSERT_NO_ERRNO_AND_VALUE(GetLinkByName(kTapName));
+  ASSERT_TRUE(link.has_value());
+
+  // Interface setup.
+  struct in_addr addr;
+  inet_pton(AF_INET, "10.0.0.1", &addr);
+  EXPECT_NO_ERRNO(LinkAddLocalAddr(link->index, AF_INET, /*prefixlen=*/24,
+                                   &addr, sizeof(addr)));
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME: gVisor doesn't support setting MAC address on interfaces yet.
+    EXPECT_NO_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA)));
+
+    // FIXME: gVisor always creates enabled/up'd interfaces.
+    EXPECT_NO_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP));
+  }
+
+  ping_pkt ping_req = CreatePingPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
+  std::string arp_rep = CreateArpPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
+
+  // Send ping, this would trigger an ARP request on Linux.
+  EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)),
+              SyscallSucceedsWithValue(sizeof(ping_req)));
+
+  // Receive loop to process inbound packets.
+  struct inpkt {
+    union {
+      pihdr pi;
+      ping_pkt ping;
+      arp_pkt arp;
+    };
+  };
+  while (1) {
+    inpkt r = {};
+    const ssize_t n = read(fd.get(), &r, sizeof(r));
+    // A failed read must abort the test: merely recording the failure would
+    // leave this loop spinning on the same error forever.
+    ASSERT_THAT(n, SyscallSucceeds());
+    // n is non-negative from here on, so do the size comparisons on an
+    // unsigned length instead of mixing signed and unsigned operands.
+    const size_t len = static_cast<size_t>(n);
+
+    if (len < sizeof(pihdr)) {
+      std::cerr << "Ignored packet, protocol: " << r.pi.pi_protocol
+                << " len: " << len << std::endl;
+      continue;
+    }
+
+    // Process ARP packet.
+    if (len >= sizeof(arp_pkt) && r.pi.pi_protocol == htons(ETH_P_ARP)) {
+      // Respond with canned ARP reply.
+      EXPECT_THAT(write(fd.get(), arp_rep.data(), arp_rep.size()),
+                  SyscallSucceedsWithValue(arp_rep.size()));
+      // First ping request might have been dropped due to mac address not in
+      // ARP cache. Send it again.
+      EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)),
+                  SyscallSucceedsWithValue(sizeof(ping_req)));
+      // An ARP frame cannot also be the echo reply; wait for the next packet.
+      continue;
+    }
+
+    // Process ping response packet.
+    if (len >= sizeof(ping_pkt) &&
+        r.pi.pi_protocol == ping_req.pi.pi_protocol &&
+        r.ping.ip.protocol == ping_req.ip.protocol &&
+        !memcmp(&r.ping.ip.saddr, &ping_req.ip.daddr, kIPLen) &&
+        !memcmp(&r.ping.ip.daddr, &ping_req.ip.saddr, kIPLen) &&
+        r.ping.icmp.type == 0 && r.ping.icmp.code == 0) {
+      // Ends and passes the test.
+      break;
+    }
+  }
+}
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From 75d7f76a6cd81d77f5ce70440c1d95c0296b15ba Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Mon, 11 Nov 2019 20:26:38 -0800
Subject: arm64: add a travis build ci

Build runsc and run "runsc do ls".

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 .travis.yml                                  | 19 ++++++++++++++++++
 Dockerfile                                   | 11 ++++++-----
 Makefile                                     |  5 ++++-
 test/syscalls/linux/32bit.cc                 |  2 +-
 test/syscalls/linux/rseq/uapi.h              | 29 ++++++++++++----------------
 test/syscalls/linux/udp_socket_test_cases.cc |  4 ++++
 6 files changed, 46 insertions(+), 24 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/.travis.yml b/.travis.yml
index e69de29bb..a2a260538 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -0,0 +1,19 @@
+language: minimal
+sudo: required
+dist: xenial
+cache:
+  directories:
+    - /home/travis/.cache/bazel/
+services:
+  - docker
+matrix:
+  include:
+   - os: linux
+     arch: amd64
+     env: RUNSC_PATH=./bazel-bin/runsc/linux_amd64_pure_stripped/runsc
+   - os: linux
+     arch: arm64
+     env: RUNSC_PATH=./bazel-bin/runsc/linux_arm64_pure_stripped/runsc
+script:
+   - uname -a
+   - make DOCKER_RUN_OPTIONS="" BAZEL_OPTIONS="build runsc:runsc" bazel && $RUNSC_PATH --alsologtostderr --network none --debug --TESTONLY-unsafe-nonroot=true --rootless do ls
diff --git a/Dockerfile b/Dockerfile
index 738623023..2bfdfec6c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,9 @@
-FROM ubuntu:bionic
+FROM fedora:31
 
-RUN apt-get update && apt-get install -y curl gnupg2 git python python3 python3-distutils python3-pip
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
-RUN apt-get update && apt-get install -y bazel && apt-get clean
+RUN  dnf install -y dnf-plugins-core && dnf copr enable -y vbatts/bazel
+
+RUN dnf install -y bazel2 git gcc make golang gcc-c++ glibc-devel python3 which python3-pip python3-devel libffi-devel openssl-devel pkg-config glibc-static
+
+RUN pip install pycparser
 
 WORKDIR /gvisor
diff --git a/Makefile b/Makefile
index a73bc0c36..d9531fbd5 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,9 @@ UID := $(shell id -u ${USER})
 GID := $(shell id -g ${USER})
 GVISOR_BAZEL_CACHE := $(shell readlink -f ~/.cache/bazel/)
 
+# The --privileged option is required to run tests.
+DOCKER_RUN_OPTIONS ?= --privileged
+
 all: runsc
 
 docker-build:
@@ -19,7 +22,7 @@ bazel-server-start: docker-build
 		-v "$(CURDIR):$(CURDIR)" \
 		--workdir "$(CURDIR)" \
 		--tmpfs /tmp:rw,exec \
-		--privileged \
+		$(DOCKER_RUN_OPTIONS) \
 		gvisor-bazel \
 		sh -c "while :; do sleep 100; done" && \
 	docker exec --user 0:0 -i gvisor-bazel sh -c "groupadd --gid $(GID) --non-unique gvisor && useradd --uid $(UID) --non-unique --gid $(GID) -d $(HOME) gvisor"
diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
index c47a05181..3c825477c 100644
--- a/test/syscalls/linux/32bit.cc
+++ b/test/syscalls/linux/32bit.cc
@@ -74,7 +74,7 @@ void ExitGroup32(const char instruction[2], int code) {
       "int $3\n"
       :
       : [ code ] "m"(code), [ ip ] "d"(m.ptr())
-      : "rax", "rbx", "rsp");
+      : "rax", "rbx");
 }
 
 constexpr int kExitCode = 42;
diff --git a/test/syscalls/linux/rseq/uapi.h b/test/syscalls/linux/rseq/uapi.h
index e3ff0579a..ca1d67691 100644
--- a/test/syscalls/linux/rseq/uapi.h
+++ b/test/syscalls/linux/rseq/uapi.h
@@ -15,14 +15,9 @@
 #ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
 #define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
 
-// User-kernel ABI for restartable sequences.
+#include <stdint.h>
 
-// Standard types.
-//
-// N.B. This header will be included in targets that do have the standard
-// library, so we can't shadow the standard type names.
-using __u32 = __UINT32_TYPE__;
-using __u64 = __UINT64_TYPE__;
+// User-kernel ABI for restartable sequences.
 
 #ifdef __x86_64__
 // Syscall numbers.
@@ -32,20 +27,20 @@ constexpr int kRseqSyscall = 334;
 #endif  // __x86_64__
 
 struct rseq_cs {
-  __u32 version;
-  __u32 flags;
-  __u64 start_ip;
-  __u64 post_commit_offset;
-  __u64 abort_ip;
-} __attribute__((aligned(4 * sizeof(__u64))));
+  uint32_t version;
+  uint32_t flags;
+  uint64_t start_ip;
+  uint64_t post_commit_offset;
+  uint64_t abort_ip;
+} __attribute__((aligned(4 * sizeof(uint64_t))));
 
 // N.B. alignment is enforced by the kernel.
 struct rseq {
-  __u32 cpu_id_start;
-  __u32 cpu_id;
+  uint32_t cpu_id_start;
+  uint32_t cpu_id;
   struct rseq_cs* rseq_cs;
-  __u32 flags;
-} __attribute__((aligned(4 * sizeof(__u64))));
+  uint32_t flags;
+} __attribute__((aligned(4 * sizeof(uint64_t))));
 
 constexpr int kRseqFlagUnregister = 1 << 0;
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 57b1a357c..740c7986d 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -21,6 +21,10 @@
 #include <sys/socket.h>
 #include <sys/types.h>
 
+#ifndef SIOCGSTAMP
+#include <linux/sockios.h>
+#endif
+
 #include "gtest/gtest.h"
 #include "absl/base/macros.h"
 #include "absl/time/clock.h"
-- 
cgit v1.2.3


From de0b2ebf8635a75bfabfd0a8b48de7923017574e Mon Sep 17 00:00:00 2001
From: Jay Zhuang <jayzhuang@google.com>
Date: Wed, 26 Feb 2020 18:16:19 -0800
Subject: Add getsockopt tests for SO_SNDTIMEO and SO_RCVTIMEO

PiperOrigin-RevId: 297485310
---
 test/syscalls/linux/socket_generic.cc | 96 ++++++++++++++++++++++++++++++++---
 1 file changed, 88 insertions(+), 8 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc
index e8f24a59e..f7d6139f1 100644
--- a/test/syscalls/linux/socket_generic.cc
+++ b/test/syscalls/linux/socket_generic.cc
@@ -447,6 +447,60 @@ TEST_P(AllSocketPairTest, RecvTimeoutRecvmsgSucceeds) {
               SyscallFailsWithErrno(EAGAIN));
 }
 
+// A new socket is expected to report a zero SO_SNDTIMEO.
+TEST_P(AllSocketPairTest, SendTimeoutDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  // Start from non-zero values so that a getsockopt() that leaves the buffer
+  // untouched is detected.
+  timeval got = {};
+  got.tv_sec = -1;
+  got.tv_usec = -1;
+  socklen_t got_len = sizeof(got);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &got,
+                         &got_len),
+              SyscallSucceeds());
+  EXPECT_EQ(got.tv_sec, 0);
+  EXPECT_EQ(got.tv_usec, 0);
+}
+
+// A SO_SNDTIMEO value set with setsockopt() is expected to be read back
+// unchanged by getsockopt().
+TEST_P(AllSocketPairTest, SetGetSendTimeout) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  timeval wanted = {};
+  wanted.tv_sec = 89;
+  wanted.tv_usec = 42000;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &wanted,
+                         sizeof(wanted)),
+              SyscallSucceeds());
+
+  timeval got = {};
+  socklen_t got_len = sizeof(got);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &got,
+                         &got_len),
+              SyscallSucceeds());
+  EXPECT_EQ(got.tv_sec, 89);
+  EXPECT_EQ(got.tv_usec, 42000);
+}
+
+// SO_SNDTIMEO is expected to accept, and report through, a buffer that is
+// larger than a timeval; only the leading timeval is checked.
+TEST_P(AllSocketPairTest, SetGetSendTimeoutLargerArg) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  struct timeval_with_extra {
+    struct timeval tv;
+    int64_t extra_data;
+  } ABSL_ATTRIBUTE_PACKED;
+
+  timeval_with_extra wanted = {};
+  wanted.tv.tv_sec = 0;
+  wanted.tv.tv_usec = 123000;
+
+  EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &wanted,
+                         sizeof(wanted)),
+              SyscallSucceeds());
+
+  timeval_with_extra got = {};
+  socklen_t got_len = sizeof(got);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &got,
+                         &got_len),
+              SyscallSucceeds());
+  EXPECT_EQ(got.tv.tv_sec, 0);
+  EXPECT_EQ(got.tv.tv_usec, 123000);
+}
+
 TEST_P(AllSocketPairTest, SendTimeoutAllowsWrite) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
@@ -491,18 +545,36 @@ TEST_P(AllSocketPairTest, SendTimeoutAllowsSendmsg) {
   ASSERT_NO_FATAL_FAILURE(SendNullCmsg(sockets->first_fd(), buf, sizeof(buf)));
 }
 
-TEST_P(AllSocketPairTest, SoRcvTimeoIsSet) {
+TEST_P(AllSocketPairTest, RecvTimeoutDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
-  struct timeval tv {
-    .tv_sec = 0, .tv_usec = 35
-  };
+  timeval actual_tv = {.tv_sec = -1, .tv_usec = -1};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv_usec, 0);
+}
+
+TEST_P(AllSocketPairTest, SetGetRecvTimeout) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  timeval tv = {.tv_sec = 123, .tv_usec = 456000};
   EXPECT_THAT(
       setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
       SyscallSucceeds());
+
+  timeval actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv_sec, 123);
+  EXPECT_EQ(actual_tv.tv_usec, 456000);
 }
 
-TEST_P(AllSocketPairTest, SoRcvTimeoIsSetLargerArg) {
+TEST_P(AllSocketPairTest, SetGetRecvTimeoutLargerArg) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
   struct timeval_with_extra {
@@ -510,13 +582,21 @@ TEST_P(AllSocketPairTest, SoRcvTimeoIsSetLargerArg) {
     int64_t extra_data;
   } ABSL_ATTRIBUTE_PACKED;
 
-  timeval_with_extra tv_extra;
-  tv_extra.tv.tv_sec = 0;
-  tv_extra.tv.tv_usec = 25;
+  timeval_with_extra tv_extra = {
+      .tv = {.tv_sec = 0, .tv_usec = 432000},
+  };
 
   EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
                          &tv_extra, sizeof(tv_extra)),
               SyscallSucceeds());
+
+  timeval_with_extra actual_tv = {};
+  socklen_t len = sizeof(actual_tv);
+  EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+                         &actual_tv, &len),
+              SyscallSucceeds());
+  EXPECT_EQ(actual_tv.tv.tv_sec, 0);
+  EXPECT_EQ(actual_tv.tv.tv_usec, 432000);
 }
 
 TEST_P(AllSocketPairTest, RecvTimeoutRecvmsgOneSecondSucceeds) {
-- 
cgit v1.2.3


From abf7ebcd38e8c2750f4542f29115140bb2b44a9b Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Thu, 27 Feb 2020 10:59:32 -0800
Subject: Internal change.

PiperOrigin-RevId: 297638665
---
 pkg/sentry/socket/netstack/netstack.go |  40 +++++++++--
 pkg/tcpip/transport/packet/endpoint.go |  21 +++++-
 test/syscalls/linux/packet_socket.cc   | 124 ++++++++++++++++++++++++++++++---
 3 files changed, 167 insertions(+), 18 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index e187276c5..48c268bfa 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -712,14 +712,40 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
-	addr, family, err := AddressAndFamily(sockaddr)
-	if err != nil {
-		return err
-	}
-	if err := s.checkFamily(family, true /* exact */); err != nil {
-		return err
+	family := usermem.ByteOrder.Uint16(sockaddr)
+	var addr tcpip.FullAddress
+
+	// Bind for AF_PACKET requires only family, protocol and ifindex.
+	// In function AddressAndFamily, we check the address length which is
+	// not needed for AF_PACKET bind.
+	if family == linux.AF_PACKET {
+		var a linux.SockAddrLink
+		if len(sockaddr) < sockAddrLinkSize {
+			return syserr.ErrInvalidArgument
+		}
+		binary.Unmarshal(sockaddr[:sockAddrLinkSize], usermem.ByteOrder, &a)
+
+		if a.Protocol != uint16(s.protocol) {
+			return syserr.ErrInvalidArgument
+		}
+
+		addr = tcpip.FullAddress{
+			NIC:  tcpip.NICID(a.InterfaceIndex),
+			Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
+		}
+	} else {
+		var err *syserr.Error
+		addr, family, err = AddressAndFamily(sockaddr)
+		if err != nil {
+			return err
+		}
+
+		if err = s.checkFamily(family, true /* exact */); err != nil {
+			return err
+		}
+
+		addr = s.mapFamily(addr, family)
 	}
-	addr = s.mapFamily(addr, family)
 
 	// Issue the bind request to the endpoint.
 	return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 5722815e9..09a1cd436 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -76,6 +76,7 @@ type endpoint struct {
 	sndBufSize int
 	closed     bool
 	stats      tcpip.TransportEndpointStats `state:"nosave"`
+	bound      bool
 }
 
 // NewEndpoint returns a new packet endpoint.
@@ -125,6 +126,7 @@ func (ep *endpoint) Close() {
 	}
 
 	ep.closed = true
+	ep.bound = false
 	ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 }
 
@@ -216,7 +218,24 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
 	// sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex."
 	// - packet(7).
 
-	return tcpip.ErrNotSupported
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+
+	if ep.bound {
+		return tcpip.ErrAlreadyBound
+	}
+
+	// Unregister endpoint with all the nics.
+	ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep)
+
+	// Bind endpoint to receive packets from specific interface.
+	if err := ep.stack.RegisterPacketEndpoint(addr.NIC, ep.netProto, ep); err != nil {
+		return err
+	}
+
+	ep.bound = true
+
+	return nil
 }
 
 // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index 92ae55eec..bc22de788 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <arpa/inet.h>
+#include <ifaddrs.h>
 #include <linux/capability.h>
 #include <linux/if_arp.h>
 #include <linux/if_packet.h>
@@ -163,16 +164,11 @@ int CookedPacketTest::GetLoopbackIndex() {
   return ifr.ifr_ifindex;
 }
 
-// Receive via a packet socket.
-TEST_P(CookedPacketTest, Receive) {
-  // Let's use a simple IP payload: a UDP datagram.
-  FileDescriptor udp_sock =
-      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
-  SendUDPMessage(udp_sock.get());
-
+// Receive and verify the message via packet socket on interface.
+void ReceiveMessage(int sock, int ifindex) {
   // Wait for the socket to become readable.
   struct pollfd pfd = {};
-  pfd.fd = socket_;
+  pfd.fd = sock;
   pfd.events = POLLIN;
   EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 2000), SyscallSucceedsWithValue(1));
 
@@ -182,9 +178,10 @@ TEST_P(CookedPacketTest, Receive) {
   char buf[64];
   struct sockaddr_ll src = {};
   socklen_t src_len = sizeof(src);
-  ASSERT_THAT(recvfrom(socket_, buf, sizeof(buf), 0,
+  ASSERT_THAT(recvfrom(sock, buf, sizeof(buf), 0,
                        reinterpret_cast<struct sockaddr*>(&src), &src_len),
               SyscallSucceedsWithValue(packet_size));
+
   // sockaddr_ll ends with an 8 byte physical address field, but ethernet
   // addresses only use 6 bytes.  Linux used to return sizeof(sockaddr_ll)-2
   // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns
@@ -194,7 +191,7 @@ TEST_P(CookedPacketTest, Receive) {
   // TODO(b/129292371): Verify protocol once we return it.
   // Verify the source address.
   EXPECT_EQ(src.sll_family, AF_PACKET);
-  EXPECT_EQ(src.sll_ifindex, GetLoopbackIndex());
+  EXPECT_EQ(src.sll_ifindex, ifindex);
   EXPECT_EQ(src.sll_halen, ETH_ALEN);
   // This came from the loopback device, so the address is all 0s.
   for (int i = 0; i < src.sll_halen; i++) {
@@ -222,6 +219,18 @@ TEST_P(CookedPacketTest, Receive) {
   EXPECT_EQ(strncmp(payload, kMessage, sizeof(kMessage)), 0);
 }
 
+// Receive via a packet socket.
+TEST_P(CookedPacketTest, Receive) {
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  SendUDPMessage(udp_sock.get());
+
+  // Receive and verify the data.
+  int loopback_index = GetLoopbackIndex();
+  ReceiveMessage(socket_, loopback_index);
+}
+
 // Send via a packet socket.
 TEST_P(CookedPacketTest, Send) {
   // TODO(b/129292371): Remove once we support packet socket writing.
@@ -313,6 +322,101 @@ TEST_P(CookedPacketTest, Send) {
   EXPECT_EQ(src.sin_addr.s_addr, htonl(INADDR_LOOPBACK));
 }
 
+// Bind and receive via packet socket.
+TEST_P(CookedPacketTest, BindReceive) {
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = GetLoopbackIndex();
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+  SendUDPMessage(udp_sock.get());
+
+  // Receive and verify the data.
+  ReceiveMessage(socket_, bind_addr.sll_ifindex);
+}
+
+// Double Bind socket.
+TEST_P(CookedPacketTest, DoubleBind) {
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = GetLoopbackIndex();
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Binding socket again should fail.
+  ASSERT_THAT(
+      bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+           sizeof(bind_addr)),
+      // Linux 4.09 returns EINVAL here, but some time before 4.19 it switched
+      // to EADDRINUSE.
+      AnyOf(SyscallFailsWithErrno(EADDRINUSE), SyscallFailsWithErrno(EINVAL)));
+}
+
+// Bind and verify we do not receive data on interface which is not bound
+TEST_P(CookedPacketTest, BindDrop) {
+  // Let's use a simple IP payload: a UDP datagram.
+  FileDescriptor udp_sock =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+  struct ifaddrs* if_addr_list = nullptr;
+  auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); });
+
+  ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds());
+
+  // Get interface other than loopback.
+  struct ifreq ifr = {};
+  for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) {
+    if (strcmp(i->ifa_name, "lo") != 0) {
+      strncpy(ifr.ifr_name, i->ifa_name, sizeof(ifr.ifr_name));
+      break;
+    }
+  }
+
+  // Skip if no interface is available other than loopback.
+  if (strlen(ifr.ifr_name) == 0) {
+    GTEST_SKIP();
+  }
+
+  // Get interface index.
+  EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds());
+  EXPECT_NE(ifr.ifr_ifindex, 0);
+
+  // Bind to packet socket requires only family, protocol and ifindex.
+  struct sockaddr_ll bind_addr = {};
+  bind_addr.sll_family = AF_PACKET;
+  bind_addr.sll_protocol = htons(GetParam());
+  bind_addr.sll_ifindex = ifr.ifr_ifindex;
+
+  ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+                   sizeof(bind_addr)),
+              SyscallSucceeds());
+
+  // Send to loopback interface.
+  struct sockaddr_in dest = {};
+  dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+  dest.sin_family = AF_INET;
+  dest.sin_port = kPort;
+  EXPECT_THAT(sendto(udp_sock.get(), kMessage, sizeof(kMessage), 0,
+                     reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)),
+              SyscallSucceedsWithValue(sizeof(kMessage)));
+
+  // Wait and make sure the socket never receives any data.
+  struct pollfd pfd = {};
+  pfd.fd = socket_;
+  pfd.events = POLLIN;
+  EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest,
                          ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
-- 
cgit v1.2.3


From dd1ed5c789ff72fd6bbacda0ff7c7acf9672d25a Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Fri, 28 Feb 2020 14:47:34 +0800
Subject: skip vsyscall test cases on Arm64

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 test/syscalls/linux/time.cc     | 2 ++
 test/syscalls/linux/vsyscall.cc | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc
index 1ccb95733..e75bba669 100644
--- a/test/syscalls/linux/time.cc
+++ b/test/syscalls/linux/time.cc
@@ -26,6 +26,7 @@ namespace {
 
 constexpr long kFudgeSeconds = 5;
 
+#if defined(__x86_64__) || defined(__i386__)
 // Mimics the time(2) wrapper from glibc prior to 2.15.
 time_t vsyscall_time(time_t* t) {
   constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
@@ -98,6 +99,7 @@ TEST(TimeTest, VsyscallGettimeofday_InvalidAddressSIGSEGV) {
                                     reinterpret_cast<struct timezone*>(0x1)),
               ::testing::KilledBySignal(SIGSEGV), "");
 }
+#endif
 
 }  // namespace
 
diff --git a/test/syscalls/linux/vsyscall.cc b/test/syscalls/linux/vsyscall.cc
index 2c2303358..ae4377108 100644
--- a/test/syscalls/linux/vsyscall.cc
+++ b/test/syscalls/linux/vsyscall.cc
@@ -24,6 +24,7 @@ namespace testing {
 
 namespace {
 
+#if defined(__x86_64__) || defined(__i386__)
 time_t vsyscall_time(time_t* t) {
   constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
   return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
@@ -37,6 +38,7 @@ TEST(VsyscallTest, VsyscallAlwaysAvailableOnGvisor) {
   time_t t;
   EXPECT_THAT(vsyscall_time(&t), SyscallSucceeds());
 }
+#endif
 
 }  // namespace
 
-- 
cgit v1.2.3


From 6b4d36e3253238dd72d0861ac1220d147e1de8dd Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 28 Feb 2020 10:37:52 -0800
Subject: Hide /dev/net/tun when using hostinet.

/dev/net/tun does not currently work with hostinet. This has caused some
programs to fail at startup because they think the feature exists.

PiperOrigin-RevId: 297876196
---
 pkg/sentry/fs/dev/BUILD                |  1 +
 pkg/sentry/fs/dev/dev.go               |  7 +++++--
 pkg/sentry/fs/dev/net_tun.go           |  7 +++++++
 pkg/sentry/kernel/kernel.go            |  4 ++++
 test/syscalls/BUILD                    |  5 +++++
 test/syscalls/linux/BUILD              | 12 +++++++++++
 test/syscalls/linux/dev.cc             |  7 -------
 test/syscalls/linux/tuntap.cc          |  7 +++++++
 test/syscalls/linux/tuntap_hostinet.cc | 37 ++++++++++++++++++++++++++++++++++
 9 files changed, 78 insertions(+), 9 deletions(-)
 create mode 100644 test/syscalls/linux/tuntap_hostinet.cc

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD
index 9b6bb26d0..9379a4d7b 100644
--- a/pkg/sentry/fs/dev/BUILD
+++ b/pkg/sentry/fs/dev/BUILD
@@ -26,6 +26,7 @@ go_library(
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/ramfs",
         "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/memmap",
         "//pkg/sentry/mm",
diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go
index 7e66c29b0..acbd401a0 100644
--- a/pkg/sentry/fs/dev/dev.go
+++ b/pkg/sentry/fs/dev/dev.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -124,10 +125,12 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
 		"ptmx": newSymlink(ctx, "pts/ptmx", msrc),
 
 		"tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor),
+	}
 
-		"net": newDirectory(ctx, map[string]*fs.Inode{
+	if isNetTunSupported(inet.StackFromContext(ctx)) {
+		contents["net"] = newDirectory(ctx, map[string]*fs.Inode{
 			"tun": newCharacterDevice(ctx, newNetTunDevice(ctx, fs.RootOwner, 0666), msrc, netTunDevMajor, netTunDevMinor),
-		}, msrc),
+		}, msrc)
 	}
 
 	iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go
index 755644488..dc7ad075a 100644
--- a/pkg/sentry/fs/dev/net_tun.go
+++ b/pkg/sentry/fs/dev/net_tun.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -168,3 +169,9 @@ func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.Eve
 func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) {
 	fops.device.EventUnregister(e)
 }
+
+// isNetTunSupported returns whether /dev/net/tun device is supported for s.
+func isNetTunSupported(s inet.Stack) bool {
+	_, ok := s.(*netstack.Stack)
+	return ok
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 8b76750e9..1d627564f 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -755,6 +755,8 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
 		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
@@ -1481,6 +1483,8 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
 		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 3518e862d..a69b0ce13 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -680,6 +680,11 @@ syscall_test(
 
 syscall_test(test = "//test/syscalls/linux:tuntap_test")
 
+syscall_test(
+    add_hostinet = True,
+    test = "//test/syscalls/linux:tuntap_hostinet_test",
+)
+
 syscall_test(test = "//test/syscalls/linux:udp_bind_test")
 
 syscall_test(
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 704bae17b..70c120e42 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3460,6 +3460,18 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "tuntap_hostinet_test",
+    testonly = 1,
+    srcs = ["tuntap_hostinet.cc"],
+    linkstatic = 1,
+    deps = [
+        gtest,
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
 cc_library(
     name = "udp_socket_test_cases",
     testonly = 1,
diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc
index 4e473268c..4dd302eed 100644
--- a/test/syscalls/linux/dev.cc
+++ b/test/syscalls/linux/dev.cc
@@ -153,13 +153,6 @@ TEST(DevTest, TTYExists) {
   EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
 }
 
-TEST(DevTest, NetTunExists) {
-  struct stat statbuf = {};
-  ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallSucceeds());
-  // Check that it's a character device with rw-rw-rw- permissions.
-  EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
-}
-
 }  // namespace
 }  // namespace testing
 
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
index f6ac9d7b8..f734511d6 100644
--- a/test/syscalls/linux/tuntap.cc
+++ b/test/syscalls/linux/tuntap.cc
@@ -153,6 +153,13 @@ std::string CreateArpPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
 
 }  // namespace
 
+TEST(TuntapStaticTest, NetTunExists) {
+  struct stat statbuf;
+  ASSERT_THAT(stat(kDevNetTun, &statbuf), SyscallSucceeds());
+  // Check that it's a character device with rw-rw-rw- permissions.
+  EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
+}
+
 class TuntapTest : public ::testing::Test {
  protected:
   void TearDown() override {
diff --git a/test/syscalls/linux/tuntap_hostinet.cc b/test/syscalls/linux/tuntap_hostinet.cc
new file mode 100644
index 000000000..0c527419e
--- /dev/null
+++ b/test/syscalls/linux/tuntap_hostinet.cc
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(TuntapHostInetTest, NoNetTun) {
+  SKIP_IF(!IsRunningOnGvisor());
+
+  struct stat statbuf;
+  ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallFailsWithErrno(ENOENT));
+}
+
+}  // namespace
+}  // namespace testing
+
+}  // namespace gvisor
-- 
cgit v1.2.3


From 36b193b1db60cad3c1c65ce3abef03a6a0594e3e Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Mon, 2 Mar 2020 07:13:47 +0000
Subject: Fix syscall test build error on arm64.

The error was introduced in the merge of PR #1471.
Some code went missing when the bazel select_arch
command was added to the test/syscalls/linux/BUILD file.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I8cae3f4ae78c2e14671f3ac6e7361dc2806d9305
---
 test/syscalls/linux/BUILD | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 70c120e42..9ab13ba07 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -608,7 +608,10 @@ cc_binary(
 cc_binary(
     name = "exceptions_test",
     testonly = 1,
-    srcs = ["exceptions.cc"],
+    srcs = select_arch(
+	amd64 = ["exceptions.cc"],
+        arm64 = [],
+    ),
     linkstatic = 1,
     deps = [
         gtest,
@@ -1475,7 +1478,10 @@ cc_binary(
 cc_binary(
     name = "arch_prctl_test",
     testonly = 1,
-    srcs = ["arch_prctl.cc"],
+    srcs = select_arch(
+        amd64 = ["arch_prctl.cc"],
+        arm64 = [],
+    ),
     linkstatic = 1,
     deps = [
         "//test/util:file_descriptor",
@@ -3322,7 +3328,10 @@ cc_binary(
 cc_binary(
     name = "sysret_test",
     testonly = 1,
-    srcs = ["sysret.cc"],
+    srcs = select_arch(
+        amd64 = ["sysret.cc"],
+        arm64 = [],
+    ),
     linkstatic = 1,
     deps = [
         gtest,
-- 
cgit v1.2.3


From 33101752501fafea99d77f34bbd65f3e0083d22e Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Mon, 2 Mar 2020 14:43:52 -0800
Subject: Fix data-race when reading/writing e.amss.

PiperOrigin-RevId: 298451319
---
 pkg/tcpip/transport/tcp/connect.go  | 11 +++++++++--
 pkg/tcpip/transport/tcp/endpoint.go | 29 ++++++++++++++++++-----------
 test/syscalls/linux/tcp_socket.cc   | 15 +++++++++++++++
 3 files changed, 42 insertions(+), 13 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index cd247f3e1..ae4f3f3a9 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -295,6 +295,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	h.state = handshakeSynRcvd
 	h.ep.mu.Lock()
 	ttl := h.ep.ttl
+	amss := h.ep.amss
 	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
 	synOpts := header.TCPSynOptions{
@@ -307,7 +308,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 		// permits SACK. This is not explicitly defined in the RFC but
 		// this is the behaviour implemented by Linux.
 		SACKPermitted: rcvSynOpts.SACKPermitted,
-		MSS:           h.ep.amss,
+		MSS:           amss,
 	}
 	if ttl == 0 {
 		ttl = s.route.DefaultTTL()
@@ -356,6 +357,10 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			return tcpip.ErrInvalidEndpointState
 		}
 
+		h.ep.mu.RLock()
+		amss := h.ep.amss
+		h.ep.mu.RUnlock()
+
 		h.resetState()
 		synOpts := header.TCPSynOptions{
 			WS:            h.rcvWndScale,
@@ -363,7 +368,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			TSVal:         h.ep.timestamp(),
 			TSEcr:         h.ep.recentTimestamp(),
 			SACKPermitted: h.ep.sackPermitted,
-			MSS:           h.ep.amss,
+			MSS:           amss,
 		}
 		h.ep.sendSynTCP(&s.route, h.ep.ID, h.ep.ttl, h.ep.sendTOS, h.flags, h.iss, h.ackNum, h.rcvWnd, synOpts)
 		return nil
@@ -530,6 +535,7 @@ func (h *handshake) execute() *tcpip.Error {
 
 	// Send the initial SYN segment and loop until the handshake is
 	// completed.
+	h.ep.mu.Lock()
 	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
 
 	synOpts := header.TCPSynOptions{
@@ -540,6 +546,7 @@ func (h *handshake) execute() *tcpip.Error {
 		SACKPermitted: bool(sackEnabled),
 		MSS:           h.ep.amss,
 	}
+	h.ep.mu.Unlock()
 
 	// Execute is also called in a listen context so we want to make sure we
 	// only send the TS/SACK option when we received the TS/SACK in the
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 9e72730bd..8b9154e69 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -959,15 +959,18 @@ func (e *endpoint) initialReceiveWindow() int {
 // ModerateRecvBuf adjusts the receive buffer and the advertised window
 // based on the number of bytes copied to user space.
 func (e *endpoint) ModerateRecvBuf(copied int) {
+	e.mu.RLock()
 	e.rcvListMu.Lock()
 	if e.rcvAutoParams.disabled {
 		e.rcvListMu.Unlock()
+		e.mu.RUnlock()
 		return
 	}
 	now := time.Now()
 	if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt {
 		e.rcvAutoParams.copied += copied
 		e.rcvListMu.Unlock()
+		e.mu.RUnlock()
 		return
 	}
 	prevRTTCopied := e.rcvAutoParams.copied + copied
@@ -1008,7 +1011,7 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 			e.rcvBufSize = rcvWnd
 			availAfter := e.receiveBufferAvailableLocked()
 			mask := uint32(notifyReceiveWindowChanged)
-			if crossed, above := e.windowCrossedACKThreshold(availAfter - availBefore); crossed && above {
+			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
 				mask |= notifyNonZeroReceiveWindow
 			}
 			e.notifyProtocolGoroutine(mask)
@@ -1023,6 +1026,7 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 	e.rcvAutoParams.measureTime = now
 	e.rcvAutoParams.copied = 0
 	e.rcvListMu.Unlock()
+	e.mu.RUnlock()
 }
 
 // IPTables implements tcpip.Endpoint.IPTables.
@@ -1052,7 +1056,6 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 
 	v, err := e.readLocked()
 	e.rcvListMu.Unlock()
-
 	e.mu.RUnlock()
 
 	if err == tcpip.ErrClosedForReceive {
@@ -1085,7 +1088,7 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 	// enough buffer space, to either fit an aMSS or half a receive buffer
 	// (whichever smaller), then notify the protocol goroutine to send a
 	// window update.
-	if crossed, above := e.windowCrossedACKThreshold(len(v)); crossed && above {
+	if crossed, above := e.windowCrossedACKThresholdLocked(len(v)); crossed && above {
 		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
 	}
 
@@ -1303,9 +1306,9 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	return num, tcpip.ControlMessages{}, nil
 }
 
-// windowCrossedACKThreshold checks if the receive window to be announced now
-// would be under aMSS or under half receive buffer, whichever smaller. This is
-// useful as a receive side silly window syndrome prevention mechanism. If
+// windowCrossedACKThresholdLocked checks if the receive window to be announced
+// now would be under aMSS or under half receive buffer, whichever smaller. This
+// is useful as a receive side silly window syndrome prevention mechanism. If
 // window grows to reasonable value, we should send ACK to the sender to inform
 // the rx space is now large. We also want ensure a series of small read()'s
 // won't trigger a flood of spurious tiny ACK's.
@@ -1316,7 +1319,9 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 // crossed will be true if the window size crossed the ACK threshold.
 // above will be true if the new window is >= ACK threshold and false
 // otherwise.
-func (e *endpoint) windowCrossedACKThreshold(deltaBefore int) (crossed bool, above bool) {
+//
+// Precondition: e.mu and e.rcvListMu must be held.
+func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed bool, above bool) {
 	newAvail := e.receiveBufferAvailableLocked()
 	oldAvail := newAvail - deltaBefore
 	if oldAvail < 0 {
@@ -1379,6 +1384,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 
 		mask := uint32(notifyReceiveWindowChanged)
 
+		e.mu.RLock()
 		e.rcvListMu.Lock()
 
 		// Make sure the receive buffer size allows us to send a
@@ -1405,11 +1411,11 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 		// Immediately send an ACK to uncork the sender silly window
 		// syndrome prevetion, when our available space grows above aMSS
 		// or half receive buffer, whichever smaller.
-		if crossed, above := e.windowCrossedACKThreshold(availAfter - availBefore); crossed && above {
+		if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
 			mask |= notifyNonZeroReceiveWindow
 		}
 		e.rcvListMu.Unlock()
-
+		e.mu.RUnlock()
 		e.notifyProtocolGoroutine(mask)
 		return nil
 
@@ -2414,13 +2420,14 @@ func (e *endpoint) updateSndBufferUsage(v int) {
 // to be read, or when the connection is closed for receiving (in which case
 // s will be nil).
 func (e *endpoint) readyToRead(s *segment) {
+	e.mu.RLock()
 	e.rcvListMu.Lock()
 	if s != nil {
 		s.incRef()
 		e.rcvBufUsed += s.data.Size()
 		// Increase counter if the receive window falls down below MSS
 		// or half receive buffer size, whichever smaller.
-		if crossed, above := e.windowCrossedACKThreshold(-s.data.Size()); crossed && !above {
+		if crossed, above := e.windowCrossedACKThresholdLocked(-s.data.Size()); crossed && !above {
 			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
 		}
 		e.rcvList.PushBack(s)
@@ -2428,7 +2435,7 @@ func (e *endpoint) readyToRead(s *segment) {
 		e.rcvClosed = true
 	}
 	e.rcvListMu.Unlock()
-
+	e.mu.RUnlock()
 	e.waiterQueue.Notify(waiter.EventIn)
 }
 
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index c4591a3b9..579463384 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1349,6 +1349,21 @@ TEST_P(SimpleTcpSocketTest, RecvOnClosedSocket) {
               SyscallFailsWithErrno(ENOTCONN));
 }
 
+TEST_P(SimpleTcpSocketTest, TCPConnectSoRcvBufRace) {
+  auto s = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(GetParam(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+  sockaddr_storage addr =
+      ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+  socklen_t addrlen = sizeof(addr);
+
+  RetryEINTR(connect)(s.get(), reinterpret_cast<struct sockaddr*>(&addr),
+                      addrlen);
+  int buf_sz = 1 << 18;
+  EXPECT_THAT(
+      setsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &buf_sz, sizeof(buf_sz)),
+      SyscallSucceedsWithValue(0));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
                          ::testing::Values(AF_INET, AF_INET6));
 
-- 
cgit v1.2.3


From 43abb24657e737dee1108ff0d512b2e1b6d8a3f6 Mon Sep 17 00:00:00 2001
From: Nayana Bidari <nybidari@google.com>
Date: Mon, 2 Mar 2020 16:30:51 -0800
Subject: Fix panic caused by invalid address for Bind in packet sockets.

PiperOrigin-RevId: 298476533
---
 pkg/sentry/socket/netstack/netstack.go |  4 ++++
 test/syscalls/linux/packet_socket.cc   | 13 +++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 1eeb37446..13a9a60b4 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -712,6 +712,10 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
+	if len(sockaddr) < 2 {
+		return syserr.ErrInvalidArgument
+	}
+
 	family := usermem.ByteOrder.Uint16(sockaddr)
 	var addr tcpip.FullAddress
 
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
index bc22de788..248762ca9 100644
--- a/test/syscalls/linux/packet_socket.cc
+++ b/test/syscalls/linux/packet_socket.cc
@@ -417,6 +417,19 @@ TEST_P(CookedPacketTest, BindDrop) {
   EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
 }
 
+// Bind with invalid address.
+TEST_P(CookedPacketTest, BindFail) {
+  // Null address.
+  ASSERT_THAT(bind(socket_, nullptr, sizeof(struct sockaddr)),
+              SyscallFailsWithErrno(EFAULT));
+
+  // Address of size 1.
+  uint8_t addr = 0;
+  ASSERT_THAT(
+      bind(socket_, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+      SyscallFailsWithErrno(EINVAL));
+}
+
 INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest,
                          ::testing::Values(ETH_P_IP, ETH_P_ALL));
 
-- 
cgit v1.2.3


From fc3a09cd3c56ef20fd398a5f61a5e59111ed55b3 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Tue, 3 Mar 2020 17:45:10 +0800
Subject: code clean: minor changes to compatible with ubuntu18.04

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 test/syscalls/linux/bad.cc     | 2 +-
 test/syscalls/linux/seccomp.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/bad.cc b/test/syscalls/linux/bad.cc
index adfb149df..a26fc6af3 100644
--- a/test/syscalls/linux/bad.cc
+++ b/test/syscalls/linux/bad.cc
@@ -28,7 +28,7 @@ namespace {
 constexpr uint32_t kNotImplementedSyscall = SYS_get_kernel_syms;
 #elif __aarch64__
 // Use the last of arch_specific_syscalls which are not implemented on arm64.
-constexpr uint32_t kNotImplementedSyscall = SYS_arch_specific_syscall + 15;
+constexpr uint32_t kNotImplementedSyscall = __NR_arch_specific_syscall + 15;
 #endif
 
 TEST(BadSyscallTest, NotImplemented) {
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index cf6499f8b..8e0fc9acc 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -53,7 +53,7 @@ namespace {
 constexpr uint32_t kFilteredSyscall = SYS_vserver;
 #elif __aarch64__
 // Use the last of arch_specific_syscalls which are not implemented on arm64.
-constexpr uint32_t kFilteredSyscall = SYS_arch_specific_syscall + 15;
+constexpr uint32_t kFilteredSyscall = __NR_arch_specific_syscall + 15;
 #endif
 
 // Applies a seccomp-bpf filter that returns `filtered_result` for
-- 
cgit v1.2.3


From b3c549d8391e7cadd82a5ab9280bc63bb372aa97 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Tue, 3 Mar 2020 12:36:37 -0800
Subject: Move temp_umask to test/util.

PiperOrigin-RevId: 298667595
---
 test/syscalls/linux/BUILD          |  9 ++-------
 test/syscalls/linux/mkdir.cc       |  2 +-
 test/syscalls/linux/open_create.cc |  2 +-
 test/syscalls/linux/temp_umask.h   | 39 --------------------------------------
 test/util/BUILD                    |  6 ++++++
 test/util/temp_umask.h             | 39 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 49 insertions(+), 48 deletions(-)
 delete mode 100644 test/syscalls/linux/temp_umask.h
 create mode 100644 test/util/temp_umask.h

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 70c120e42..dae2b1077 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -166,11 +166,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "temp_umask",
-    hdrs = ["temp_umask.h"],
-)
-
 cc_library(
     name = "unix_domain_socket_test_util",
     testonly = 1,
@@ -1140,11 +1135,11 @@ cc_binary(
     srcs = ["mkdir.cc"],
     linkstatic = 1,
     deps = [
-        ":temp_umask",
         "//test/util:capability_util",
         "//test/util:fs_util",
         gtest,
         "//test/util:temp_path",
+        "//test/util:temp_umask",
         "//test/util:test_main",
         "//test/util:test_util",
     ],
@@ -1299,12 +1294,12 @@ cc_binary(
     srcs = ["open_create.cc"],
     linkstatic = 1,
     deps = [
-        ":temp_umask",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
         gtest,
         "//test/util:temp_path",
+        "//test/util:temp_umask",
         "//test/util:test_main",
         "//test/util:test_util",
     ],
diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc
index cf138d328..def4c50a4 100644
--- a/test/syscalls/linux/mkdir.cc
+++ b/test/syscalls/linux/mkdir.cc
@@ -18,10 +18,10 @@
 #include <unistd.h>
 
 #include "gtest/gtest.h"
-#include "test/syscalls/linux/temp_umask.h"
 #include "test/util/capability_util.h"
 #include "test/util/fs_util.h"
 #include "test/util/temp_path.h"
+#include "test/util/temp_umask.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc
index 902d0a0dc..51eacf3f2 100644
--- a/test/syscalls/linux/open_create.cc
+++ b/test/syscalls/linux/open_create.cc
@@ -19,11 +19,11 @@
 #include <unistd.h>
 
 #include "gtest/gtest.h"
-#include "test/syscalls/linux/temp_umask.h"
 #include "test/util/capability_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/fs_util.h"
 #include "test/util/temp_path.h"
+#include "test/util/temp_umask.h"
 #include "test/util/test_util.h"
 
 namespace gvisor {
diff --git a/test/syscalls/linux/temp_umask.h b/test/syscalls/linux/temp_umask.h
deleted file mode 100644
index 81a25440c..000000000
--- a/test/syscalls/linux/temp_umask.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_
-#define GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_
-
-#include <sys/stat.h>
-#include <sys/types.h>
-
-namespace gvisor {
-namespace testing {
-
-class TempUmask {
- public:
-  // Sets the process umask to `mask`.
-  explicit TempUmask(mode_t mask) : old_mask_(umask(mask)) {}
-
-  // Sets the process umask to its previous value.
-  ~TempUmask() { umask(old_mask_); }
-
- private:
-  mode_t old_mask_;
-};
-
-}  // namespace testing
-}  // namespace gvisor
-
-#endif  // GVISOR_TEST_SYSCALLS_TEMP_UMASK_H_
diff --git a/test/util/BUILD b/test/util/BUILD
index 8b5a0f25c..2a17c33ee 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -350,3 +350,9 @@ cc_library(
         ":save_util",
     ],
 )
+
+cc_library(
+    name = "temp_umask",
+    testonly = 1,
+    hdrs = ["temp_umask.h"],
+)
diff --git a/test/util/temp_umask.h b/test/util/temp_umask.h
new file mode 100644
index 000000000..e7de84a54
--- /dev/null
+++ b/test/util/temp_umask.h
@@ -0,0 +1,39 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_UTIL_TEMP_UMASK_H_
+#define GVISOR_TEST_UTIL_TEMP_UMASK_H_
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+namespace gvisor {
+namespace testing {
+
+class TempUmask {
+ public:
+  // Sets the process umask to `mask`.
+  explicit TempUmask(mode_t mask) : old_mask_(umask(mask)) {}
+
+  // Sets the process umask to its previous value.
+  ~TempUmask() { umask(old_mask_); }
+
+ private:
+  mode_t old_mask_;
+};
+
+}  // namespace testing
+}  // namespace gvisor
+
+#endif  // GVISOR_TEST_UTIL_TEMP_UMASK_H_
-- 
cgit v1.2.3


From 504c9e14d61a9ca9fa3615290a05471684019ecc Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Tue, 3 Mar 2020 15:53:48 -0800
Subject: test/runner: use proper filters for test cases

The benchmark_filter option accepts regexes, but
the gtest-filter option accepts shell-like wildcards.

Fixes #2034

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 test/runner/gtest/gtest.go             | 7 ++++---
 test/syscalls/linux/tuntap_hostinet.cc | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/runner/gtest/gtest.go b/test/runner/gtest/gtest.go
index f96e2415e..869169ad5 100644
--- a/test/runner/gtest/gtest.go
+++ b/test/runner/gtest/gtest.go
@@ -66,13 +66,12 @@ func (tc TestCase) Args() []string {
 	}
 	if tc.benchmark {
 		return []string{
-			fmt.Sprintf("%s=^$", filterTestFlag),
 			fmt.Sprintf("%s=^%s$", filterBenchmarkFlag, tc.Name),
+			fmt.Sprintf("%s=", filterTestFlag),
 		}
 	}
 	return []string{
-		fmt.Sprintf("%s=^%s$", filterTestFlag, tc.FullName()),
-		fmt.Sprintf("%s=^$", filterBenchmarkFlag),
+		fmt.Sprintf("%s=%s", filterTestFlag, tc.FullName()),
 	}
 }
 
@@ -147,6 +146,8 @@ func ParseTestCases(testBin string, benchmarks bool, extraArgs ...string) ([]Tes
 		return nil, fmt.Errorf("could not enumerate gtest benchmarks: %v\nstderr\n%s", err, exitErr.Stderr)
 	}
 
+	out = []byte(strings.Trim(string(out), "\n"))
+
 	// Parse benchmark output.
 	for _, line := range strings.Split(string(out), "\n") {
 		// Strip comments.
diff --git a/test/syscalls/linux/tuntap_hostinet.cc b/test/syscalls/linux/tuntap_hostinet.cc
index 0c527419e..1513fb9d5 100644
--- a/test/syscalls/linux/tuntap_hostinet.cc
+++ b/test/syscalls/linux/tuntap_hostinet.cc
@@ -26,6 +26,7 @@ namespace {
 
 TEST(TuntapHostInetTest, NoNetTun) {
   SKIP_IF(!IsRunningOnGvisor());
+  SKIP_IF(!IsRunningWithHostinet());
 
   struct stat statbuf;
   ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallFailsWithErrno(ENOENT));
-- 
cgit v1.2.3


From da48fc6cca23a38faef51c5b5f8ae609940773a0 Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Thu, 5 Mar 2020 18:21:39 -0800
Subject: Stub oom_score_adj and oom_score.

Adds an oom_score_adj and oom_score proc file stub. oom_score_adj accepts
writes of values -1000 to 1000 and persists the value with the task. New tasks
inherit the parent's oom_score_adj.

oom_score is a read-only stub that always returns the value '0'.

Issue #202

PiperOrigin-RevId: 299245355
---
 pkg/sentry/fs/proc/task.go               | 126 ++++++++++++++++++++++++++-----
 pkg/sentry/fsimpl/proc/task.go           |  12 +--
 pkg/sentry/fsimpl/proc/task_files.go     |  43 +++++++++++
 pkg/sentry/fsimpl/proc/tasks_test.go     |  32 ++++----
 pkg/sentry/kernel/task.go                |  33 ++++++++
 pkg/sentry/kernel/task_clone.go          |   6 ++
 pkg/sentry/kernel/task_start.go          |   4 +
 test/syscalls/BUILD                      |   8 +-
 test/syscalls/linux/BUILD                |  13 ++++
 test/syscalls/linux/proc.cc              |  21 ++++++
 test/syscalls/linux/proc_pid_oomscore.cc |  72 ++++++++++++++++++
 11 files changed, 330 insertions(+), 40 deletions(-)
 create mode 100644 test/syscalls/linux/proc_pid_oomscore.cc

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 8ab8d8a02..4e9b0fc00 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -72,24 +72,26 @@ var _ fs.InodeOperations = (*taskDir)(nil)
 // newTaskDir creates a new proc task entry.
 func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
 	contents := map[string]*fs.Inode{
-		"auxv":      newAuxvec(t, msrc),
-		"cmdline":   newExecArgInode(t, msrc, cmdlineExecArg),
-		"comm":      newComm(t, msrc),
-		"environ":   newExecArgInode(t, msrc, environExecArg),
-		"exe":       newExe(t, msrc),
-		"fd":        newFdDir(t, msrc),
-		"fdinfo":    newFdInfoDir(t, msrc),
-		"gid_map":   newGIDMap(t, msrc),
-		"io":        newIO(t, msrc, isThreadGroup),
-		"maps":      newMaps(t, msrc),
-		"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
-		"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
-		"ns":        newNamespaceDir(t, msrc),
-		"smaps":     newSmaps(t, msrc),
-		"stat":      newTaskStat(t, msrc, isThreadGroup, p.pidns),
-		"statm":     newStatm(t, msrc),
-		"status":    newStatus(t, msrc, p.pidns),
-		"uid_map":   newUIDMap(t, msrc),
+		"auxv":          newAuxvec(t, msrc),
+		"cmdline":       newExecArgInode(t, msrc, cmdlineExecArg),
+		"comm":          newComm(t, msrc),
+		"environ":       newExecArgInode(t, msrc, environExecArg),
+		"exe":           newExe(t, msrc),
+		"fd":            newFdDir(t, msrc),
+		"fdinfo":        newFdInfoDir(t, msrc),
+		"gid_map":       newGIDMap(t, msrc),
+		"io":            newIO(t, msrc, isThreadGroup),
+		"maps":          newMaps(t, msrc),
+		"mountinfo":     seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
+		"mounts":        seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
+		"ns":            newNamespaceDir(t, msrc),
+		"oom_score":     newOOMScore(t, msrc),
+		"oom_score_adj": newOOMScoreAdj(t, msrc),
+		"smaps":         newSmaps(t, msrc),
+		"stat":          newTaskStat(t, msrc, isThreadGroup, p.pidns),
+		"statm":         newStatm(t, msrc),
+		"status":        newStatus(t, msrc, p.pidns),
+		"uid_map":       newUIDMap(t, msrc),
 	}
 	if isThreadGroup {
 		contents["task"] = p.newSubtasks(t, msrc)
@@ -796,4 +798,92 @@ func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc
 	return int64(n), err
 }
 
+// newOOMScore returns an oom_score file. It is a stub that always returns 0.
+// TODO(gvisor.dev/issue/1967)
+func newOOMScore(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	return newStaticProcInode(t, msrc, []byte("0\n"))
+}
+
+// oomScoreAdj is a file containing the oom_score adjustment for a task.
+//
+// +stateify savable
+type oomScoreAdj struct {
+	fsutil.SimpleFileInode
+
+	t *kernel.Task
+}
+
+// +stateify savable
+type oomScoreAdjFile struct {
+	fsutil.FileGenericSeek          `state:"nosave"`
+	fsutil.FileNoIoctl              `state:"nosave"`
+	fsutil.FileNoMMap               `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoopFlush            `state:"nosave"`
+	fsutil.FileNoopFsync            `state:"nosave"`
+	fsutil.FileNoopRelease          `state:"nosave"`
+	fsutil.FileNotDirReaddir        `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+	waiter.AlwaysReady              `state:"nosave"`
+
+	t *kernel.Task
+}
+
+// newOOMScoreAdj returns an oom_score_adj file.
+func newOOMScoreAdj(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	i := &oomScoreAdj{
+		SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
+		t:               t,
+	}
+	return newProcInode(t, i, msrc, fs.SpecialFile, t)
+}
+
+// Truncate implements fs.InodeOperations.Truncate. Truncate is called when
+// O_TRUNC is specified for any kind of existing Dirent but is not called via
+// (f)truncate for proc files.
+func (*oomScoreAdj) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return fs.NewFile(ctx, dirent, flags, &oomScoreAdjFile{t: o.t}), nil
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	if offset != 0 {
+		return 0, io.EOF
+	}
+	adj, err := f.t.OOMScoreAdj()
+	if err != nil {
+		return 0, err
+	}
+	adjBytes := []byte(strconv.FormatInt(int64(adj), 10) + "\n")
+	n, err := dst.CopyOut(ctx, adjBytes)
+	return int64(n), err
+}
+
+// Write implements fs.FileOperations.Write.
+func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+
+	if err := f.t.SetOOMScoreAdj(v); err != nil {
+		return 0, err
+	}
+
+	return n, nil
+}
+
 // LINT.ThenChange(../../fsimpl/proc/task.go|../../fsimpl/proc/task_files.go)
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 2d814668a..18e5cd6f6 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -62,11 +62,13 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames
 			"pid":  newNamespaceSymlink(task, inoGen.NextIno(), "pid"),
 			"user": newNamespaceSymlink(task, inoGen.NextIno(), "user"),
 		}),
-		"smaps":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}),
-		"stat":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
-		"statm":   newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}),
-		"status":  newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
-		"uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}),
+		"oom_score":     newTaskOwnedFile(task, inoGen.NextIno(), 0444, newStaticFile("0\n")),
+		"oom_score_adj": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &oomScoreAdj{task: task}),
+		"smaps":         newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}),
+		"stat":          newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
+		"statm":         newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}),
+		"status":        newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
+		"uid_map":       newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}),
 	}
 	if isThreadGroup {
 		contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers)
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index efd3b3453..5a231ac86 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -525,3 +525,46 @@ func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
 	return nil
 }
+
+// oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file.
+//
+// +stateify savable
+type oomScoreAdj struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	adj, err := o.task.OOMScoreAdj()
+	if err != nil {
+		return err
+	}
+	fmt.Fprintf(buf, "%d\n", adj)
+	return nil
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+
+	if err := o.task.SetOOMScoreAdj(v); err != nil {
+		return 0, err
+	}
+
+	return n, nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index c5d531fe0..0eb401619 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -63,21 +63,23 @@ var (
 		"thread-self": threadSelfLink.NextOff,
 	}
 	taskStaticFiles = map[string]testutil.DirentType{
-		"auxv":    linux.DT_REG,
-		"cgroup":  linux.DT_REG,
-		"cmdline": linux.DT_REG,
-		"comm":    linux.DT_REG,
-		"environ": linux.DT_REG,
-		"gid_map": linux.DT_REG,
-		"io":      linux.DT_REG,
-		"maps":    linux.DT_REG,
-		"ns":      linux.DT_DIR,
-		"smaps":   linux.DT_REG,
-		"stat":    linux.DT_REG,
-		"statm":   linux.DT_REG,
-		"status":  linux.DT_REG,
-		"task":    linux.DT_DIR,
-		"uid_map": linux.DT_REG,
+		"auxv":          linux.DT_REG,
+		"cgroup":        linux.DT_REG,
+		"cmdline":       linux.DT_REG,
+		"comm":          linux.DT_REG,
+		"environ":       linux.DT_REG,
+		"gid_map":       linux.DT_REG,
+		"io":            linux.DT_REG,
+		"maps":          linux.DT_REG,
+		"ns":            linux.DT_DIR,
+		"oom_score":     linux.DT_REG,
+		"oom_score_adj": linux.DT_REG,
+		"smaps":         linux.DT_REG,
+		"stat":          linux.DT_REG,
+		"statm":         linux.DT_REG,
+		"status":        linux.DT_REG,
+		"task":          linux.DT_DIR,
+		"uid_map":       linux.DT_REG,
 	}
 )
 
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 2cee2e6ed..c0dbbe890 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -554,6 +555,13 @@ type Task struct {
 	//
 	// startTime is protected by mu.
 	startTime ktime.Time
+
+	// oomScoreAdj is the task's OOM score adjustment. This is currently not
+	// used but is maintained for consistency.
+	// TODO(gvisor.dev/issue/1967)
+	//
+	// oomScoreAdj is protected by mu, and is owned by the task goroutine.
+	oomScoreAdj int32
 }
 
 func (t *Task) savePtraceTracer() *Task {
@@ -847,3 +855,28 @@ func (t *Task) AbstractSockets() *AbstractSocketNamespace {
 func (t *Task) ContainerID() string {
 	return t.containerID
 }
+
+// OOMScoreAdj gets the task's OOM score adjustment.
+func (t *Task) OOMScoreAdj() (int32, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.ExitState() == TaskExitDead {
+		return 0, syserror.ESRCH
+	}
+	return t.oomScoreAdj, nil
+}
+
+// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be
+// between -1000 and 1000 inclusive.
+func (t *Task) SetOOMScoreAdj(adj int32) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.ExitState() == TaskExitDead {
+		return syserror.ESRCH
+	}
+	if adj > 1000 || adj < -1000 {
+		return syserror.EINVAL
+	}
+	t.oomScoreAdj = adj
+	return nil
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 78866f280..dda502bb8 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -264,6 +264,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		rseqSignature = t.rseqSignature
 	}
 
+	adj, err := t.OOMScoreAdj()
+	if err != nil {
+		return 0, nil, err
+	}
+
 	cfg := &TaskConfig{
 		Kernel:                  t.k,
 		ThreadGroup:             tg,
@@ -282,6 +287,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		RSeqAddr:                rseqAddr,
 		RSeqSignature:           rseqSignature,
 		ContainerID:             t.ContainerID(),
+		OOMScoreAdj:             adj,
 	}
 	if opts.NewThreadGroup {
 		cfg.Parent = t
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index a5035bb7f..2bbf48bb8 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -93,6 +93,9 @@ type TaskConfig struct {
 
 	// ContainerID is the container the new task belongs to.
 	ContainerID string
+
+	// OOMScoreAdj is the task's OOM score adjustment.
+	OOMScoreAdj int32
 }
 
 // NewTask creates a new task defined by cfg.
@@ -143,6 +146,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		rseqSignature:      cfg.RSeqSignature,
 		futexWaiter:        futex.NewWaiter(),
 		containerID:        cfg.ContainerID,
+		oomScoreAdj:        cfg.OOMScoreAdj,
 	}
 	t.creds.Store(cfg.Credentials)
 	t.endStopCond.L = &t.tg.signalHandlers.mu
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index a69b0ce13..9800a0cdf 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -318,10 +318,14 @@ syscall_test(
     test = "//test/syscalls/linux:proc_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test")
-
 syscall_test(test = "//test/syscalls/linux:proc_net_test")
 
+syscall_test(test = "//test/syscalls/linux:proc_pid_oomscore_test")
+
+syscall_test(test = "//test/syscalls/linux:proc_pid_smaps_test")
+
+syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test")
+
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:pselect_test",
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 0fbd556de..43455f1a3 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -1631,6 +1631,19 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "proc_pid_oomscore_test",
+    testonly = 1,
+    srcs = ["proc_pid_oomscore.cc"],
+    linkstatic = 1,
+    deps = [
+        "//test/util:fs_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_binary(
     name = "proc_pid_smaps_test",
     testonly = 1,
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index f91187e75..5a70f6c3b 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1431,6 +1431,12 @@ TEST(ProcPidFile, SubprocessRunning) {
 
   EXPECT_THAT(ReadWhileRunning("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileRunning("oom_score", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileRunning("oom_score_adj", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
 }
 
 // Test whether /proc/PID/ files can be read for a zombie process.
@@ -1466,6 +1472,12 @@ TEST(ProcPidFile, SubprocessZombie) {
   EXPECT_THAT(ReadWhileZombied("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
 
+  EXPECT_THAT(ReadWhileZombied("oom_score", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
+  EXPECT_THAT(ReadWhileZombied("oom_score_adj", buf, sizeof(buf)),
+              SyscallSucceedsWithValue(sizeof(buf)));
+
   // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
   // on proc files.
   //
@@ -1527,6 +1539,15 @@ TEST(ProcPidFile, SubprocessExited) {
 
   EXPECT_THAT(ReadWhileExited("uid_map", buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
+
+  if (!IsRunningOnGvisor()) {
+    // FIXME(gvisor.dev/issue/164): Succeeds on gVisor.
+    EXPECT_THAT(ReadWhileExited("oom_score", buf, sizeof(buf)),
+                SyscallFailsWithErrno(ESRCH));
+  }
+
+  EXPECT_THAT(ReadWhileExited("oom_score_adj", buf, sizeof(buf)),
+              SyscallFailsWithErrno(ESRCH));
 }
 
 PosixError DirContainsImpl(absl::string_view path,
diff --git a/test/syscalls/linux/proc_pid_oomscore.cc b/test/syscalls/linux/proc_pid_oomscore.cc
new file mode 100644
index 000000000..707821a3f
--- /dev/null
+++ b/test/syscalls/linux/proc_pid_oomscore.cc
@@ -0,0 +1,72 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+
+#include <exception>
+#include <iostream>
+#include <string>
+
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Parses `path` as an int; an empty or non-numeric file yields EINVAL.
+PosixErrorOr<int> ReadProcNumber(std::string path) {
+  ASSIGN_OR_RETURN_ERRNO(std::string contents, GetContents(path));
+  if (contents.empty()) return PosixError(EINVAL, "empty file: " + path);
+  EXPECT_EQ(contents.back(), '\n');
+  int num;
+  if (!absl::SimpleAtoi(contents, &num)) {
+    return PosixError(EINVAL, "invalid value: " + contents);
+  }
+  return num;
+}
+
+TEST(ProcPidOomscoreTest, BasicRead) {
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score"));
+  EXPECT_LE(oom_score, 1000);
+  EXPECT_GE(oom_score, -1000);
+}
+
+TEST(ProcPidOomscoreAdjTest, BasicRead) {
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj"));
+
+  // oom_score_adj defaults to 0.
+  EXPECT_EQ(oom_score, 0);
+}
+
+TEST(ProcPidOomscoreAdjTest, BasicWrite) {
+  constexpr int test_value = 7;
+  const std::string to_write = std::to_string(test_value);
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/oom_score_adj", O_WRONLY));
+  // Write the whole decimal string, not just its first byte.
+  ASSERT_THAT(RetryEINTR(write)(fd.get(), to_write.c_str(), to_write.size()),
+              SyscallSucceeds());
+  auto const oom_score =
+      ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj"));
+  EXPECT_EQ(oom_score, test_value);
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
-- 
cgit v1.2.3


From d5dbe366bf7c9f5b648b8114a9dc7f45589899b1 Mon Sep 17 00:00:00 2001
From: Eyal Soha <eyalsoha@google.com>
Date: Fri, 6 Mar 2020 11:41:10 -0800
Subject: shutdown(s, SHUT_WR) in TIME-WAIT returns ENOTCONN

From RFC 793 s3.9 p61 Event Processing:

CLOSE Call during TIME-WAIT: return with "error: connection closing"

Fixes #1603

PiperOrigin-RevId: 299401353
---
 pkg/tcpip/transport/tcp/endpoint.go |  5 ++++-
 test/syscalls/linux/tcp_socket.cc   | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 40cc664c0..dc9c18b6f 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2117,10 +2117,13 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 		// Close for write.
 		if (e.shutdownFlags & tcpip.ShutdownWrite) != 0 {
 			e.sndBufMu.Lock()
-
 			if e.sndClosed {
 				// Already closed.
 				e.sndBufMu.Unlock()
+				if e.EndpointState() == StateTimeWait {
+					e.mu.Unlock()
+					return tcpip.ErrNotConnected
+				}
 				break
 			}
 
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 579463384..d9c1ac0e1 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -143,6 +143,20 @@ TEST_P(TcpSocketTest, ConnectOnEstablishedConnection) {
       SyscallFailsWithErrno(EISCONN));
 }
 
+TEST_P(TcpSocketTest, ShutdownWriteInTimeWait) {
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  EXPECT_THAT(shutdown(s_, SHUT_RDWR), SyscallSucceeds());
+  absl::SleepFor(absl::Seconds(1));  // Wait to enter TIME_WAIT.
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(TcpSocketTest, ShutdownWriteInFinWait1) {
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+  absl::SleepFor(absl::Seconds(1));  // Wait to enter FIN-WAIT2.
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+}
+
 TEST_P(TcpSocketTest, DataCoalesced) {
   char buf[10];
 
-- 
cgit v1.2.3


From b36de6e7be0542b410901d3cbcd1b3c0fc493cf5 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Mon, 9 Mar 2020 19:57:35 -0700
Subject: Move /proc/net to /proc/PID/net, and make /proc/net ->
 /proc/self/net.

Issue #1833

PiperOrigin-RevId: 299998105
---
 pkg/sentry/fs/proc/net.go            |  53 +--
 pkg/sentry/fs/proc/proc.go           |   2 +-
 pkg/sentry/fs/proc/task.go           |   1 +
 pkg/sentry/fsimpl/proc/BUILD         |   2 +-
 pkg/sentry/fsimpl/proc/task.go       |   1 +
 pkg/sentry/fsimpl/proc/task_net.go   | 790 +++++++++++++++++++++++++++++++++++
 pkg/sentry/fsimpl/proc/tasks.go      |   2 +-
 pkg/sentry/fsimpl/proc/tasks_net.go  | 787 ----------------------------------
 pkg/sentry/fsimpl/proc/tasks_test.go |   3 +-
 test/syscalls/linux/proc_net.cc      |  25 ++
 10 files changed, 849 insertions(+), 817 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/proc/task_net.go
 delete mode 100644 pkg/sentry/fsimpl/proc/tasks_net.go

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 95d5817ff..bd18177d4 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -40,47 +40,48 @@ import (
 
 // LINT.IfChange
 
-// newNet creates a new proc net entry.
-func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode {
+// newNetDir creates a new proc net entry.
+func newNetDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
+	k := t.Kernel()
+
 	var contents map[string]*fs.Inode
-	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
-	// network namespace of the calling process. We should make this per-process,
-	// a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net.
-	if s := p.k.RootNetworkNamespace().Stack(); s != nil {
+	if s := t.NetworkNamespace().Stack(); s != nil {
+		// TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task
+		// network namespace.
 		contents = map[string]*fs.Inode{
-			"dev":  seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc),
-			"snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc),
+			"dev":  seqfile.NewSeqFileInode(t, &netDev{s: s}, msrc),
+			"snmp": seqfile.NewSeqFileInode(t, &netSnmp{s: s}, msrc),
 
 			// The following files are simple stubs until they are
 			// implemented in netstack, if the file contains a
 			// header the stub is just the header otherwise it is
 			// an empty file.
-			"arp": newStaticProcInode(ctx, msrc, []byte("IP address       HW type     Flags       HW address            Mask     Device\n")),
+			"arp": newStaticProcInode(t, msrc, []byte("IP address       HW type     Flags       HW address            Mask     Device\n")),
 
-			"netlink":   newStaticProcInode(ctx, msrc, []byte("sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n")),
-			"netstat":   newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")),
-			"packet":    newStaticProcInode(ctx, msrc, []byte("sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n")),
-			"protocols": newStaticProcInode(ctx, msrc, []byte("protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")),
+			"netlink":   newStaticProcInode(t, msrc, []byte("sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n")),
+			"netstat":   newStaticProcInode(t, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")),
+			"packet":    newStaticProcInode(t, msrc, []byte("sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n")),
+			"protocols": newStaticProcInode(t, msrc, []byte("protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")),
 			// Linux sets psched values to: nsec per usec, psched
 			// tick in ns, 1000000, high res timer ticks per sec
 			// (ClockGetres returns 1ns resolution).
-			"psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))),
-			"ptype":  newStaticProcInode(ctx, msrc, []byte("Type Device      Function\n")),
-			"route":  seqfile.NewSeqFileInode(ctx, &netRoute{s: s}, msrc),
-			"tcp":    seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc),
-			"udp":    seqfile.NewSeqFileInode(ctx, &netUDP{k: k}, msrc),
-			"unix":   seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc),
+			"psched": newStaticProcInode(t, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))),
+			"ptype":  newStaticProcInode(t, msrc, []byte("Type Device      Function\n")),
+			"route":  seqfile.NewSeqFileInode(t, &netRoute{s: s}, msrc),
+			"tcp":    seqfile.NewSeqFileInode(t, &netTCP{k: k}, msrc),
+			"udp":    seqfile.NewSeqFileInode(t, &netUDP{k: k}, msrc),
+			"unix":   seqfile.NewSeqFileInode(t, &netUnix{k: k}, msrc),
 		}
 
 		if s.SupportsIPv6() {
-			contents["if_inet6"] = seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc)
-			contents["ipv6_route"] = newStaticProcInode(ctx, msrc, []byte(""))
-			contents["tcp6"] = seqfile.NewSeqFileInode(ctx, &netTCP6{k: k}, msrc)
-			contents["udp6"] = newStaticProcInode(ctx, msrc, []byte("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n"))
+			contents["if_inet6"] = seqfile.NewSeqFileInode(t, &ifinet6{s: s}, msrc)
+			contents["ipv6_route"] = newStaticProcInode(t, msrc, []byte(""))
+			contents["tcp6"] = seqfile.NewSeqFileInode(t, &netTCP6{k: k}, msrc)
+			contents["udp6"] = newStaticProcInode(t, msrc, []byte("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n"))
 		}
 	}
-	d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
-	return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
+	d := ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
+	return newProcInode(t, d, msrc, fs.SpecialDirectory, t)
 }
 
 // ifinet6 implements seqfile.SeqSource for /proc/net/if_inet6.
@@ -837,4 +838,4 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
 	return data, 0
 }
 
-// LINT.ThenChange(../../fsimpl/proc/tasks_net.go)
+// LINT.ThenChange(../../fsimpl/proc/task_net.go)
diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go
index c8abb5052..c659224a7 100644
--- a/pkg/sentry/fs/proc/proc.go
+++ b/pkg/sentry/fs/proc/proc.go
@@ -70,6 +70,7 @@ func New(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string
 		"loadavg":     seqfile.NewSeqFileInode(ctx, &loadavgData{}, msrc),
 		"meminfo":     seqfile.NewSeqFileInode(ctx, &meminfoData{k}, msrc),
 		"mounts":      newProcInode(ctx, ramfs.NewSymlink(ctx, fs.RootOwner, "self/mounts"), msrc, fs.Symlink, nil),
+		"net":         newProcInode(ctx, ramfs.NewSymlink(ctx, fs.RootOwner, "self/net"), msrc, fs.Symlink, nil),
 		"self":        newSelf(ctx, pidns, msrc),
 		"stat":        seqfile.NewSeqFileInode(ctx, &statData{k}, msrc),
 		"thread-self": newThreadSelf(ctx, pidns, msrc),
@@ -86,7 +87,6 @@ func New(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string
 	}
 
 	// Add more contents that need proc to be initialized.
-	p.AddChild(ctx, "net", p.newNetDir(ctx, k, msrc))
 	p.AddChild(ctx, "sys", p.newSysDir(ctx, msrc))
 
 	return newProcInode(ctx, p, msrc, fs.SpecialDirectory, nil), nil
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 4e9b0fc00..03cc788c8 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -84,6 +84,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bo
 		"maps":          newMaps(t, msrc),
 		"mountinfo":     seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
 		"mounts":        seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
+		"net":           newNetDir(t, msrc),
 		"ns":            newNamespaceDir(t, msrc),
 		"oom_score":     newOOMScore(t, msrc),
 		"oom_score_adj": newOOMScoreAdj(t, msrc),
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index a83245866..bb609a305 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -9,9 +9,9 @@ go_library(
         "subtasks.go",
         "task.go",
         "task_files.go",
+        "task_net.go",
         "tasks.go",
         "tasks_files.go",
-        "tasks_net.go",
         "tasks_sys.go",
     ],
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index c0d643f51..493acbd1b 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -57,6 +57,7 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames
 		"maps":    newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}),
 		//"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
 		//"mounts":    seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
+		"net": newTaskNetDir(task, inoGen),
 		"ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{
 			"net":  newNamespaceSymlink(task, inoGen.NextIno(), "net"),
 			"pid":  newNamespaceSymlink(task, inoGen.NextIno(), "pid"),
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
new file mode 100644
index 000000000..373a7b17d
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -0,0 +1,790 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"reflect"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func newTaskNetDir(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry {
+	k := task.Kernel()
+	pidns := task.PIDNamespace()
+	root := auth.NewRootCredentials(pidns.UserNamespace())
+
+	var contents map[string]*kernfs.Dentry
+	if stack := task.NetworkNamespace().Stack(); stack != nil {
+		const (
+			arp       = "IP address       HW type     Flags       HW address            Mask     Device\n"
+			netlink   = "sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n"
+			packet    = "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n"
+			protocols = "protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"
+			ptype     = "Type Device      Function\n"
+			upd6      = "  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n"
+		)
+		psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond))
+
+		// TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task
+		// network namespace.
+		contents = map[string]*kernfs.Dentry{
+			"dev":  newDentry(root, inoGen.NextIno(), 0444, &netDevData{stack: stack}),
+			"snmp": newDentry(root, inoGen.NextIno(), 0444, &netSnmpData{stack: stack}),
+
+			// The following files are simple stubs until they are implemented in
+			// netstack, if the file contains a header the stub is just the header
+			// otherwise it is an empty file.
+			"arp":       newDentry(root, inoGen.NextIno(), 0444, newStaticFile(arp)),
+			"netlink":   newDentry(root, inoGen.NextIno(), 0444, newStaticFile(netlink)),
+			"netstat":   newDentry(root, inoGen.NextIno(), 0444, &netStatData{}),
+			"packet":    newDentry(root, inoGen.NextIno(), 0444, newStaticFile(packet)),
+			"protocols": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(protocols)),
+
+			// Linux sets psched values to: nsec per usec, psched tick in ns, 1000000,
+			// high res timer ticks per sec (ClockGetres returns 1ns resolution).
+			"psched": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(psched)),
+			"ptype":  newDentry(root, inoGen.NextIno(), 0444, newStaticFile(ptype)),
+			"route":  newDentry(root, inoGen.NextIno(), 0444, &netRouteData{stack: stack}),
+			"tcp":    newDentry(root, inoGen.NextIno(), 0444, &netTCPData{kernel: k}),
+			"udp":    newDentry(root, inoGen.NextIno(), 0444, &netUDPData{kernel: k}),
+			"unix":   newDentry(root, inoGen.NextIno(), 0444, &netUnixData{kernel: k}),
+		}
+
+		if stack.SupportsIPv6() {
+			contents["if_inet6"] = newDentry(root, inoGen.NextIno(), 0444, &ifinet6{stack: stack})
+			contents["ipv6_route"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(""))
+			contents["tcp6"] = newDentry(root, inoGen.NextIno(), 0444, &netTCP6Data{kernel: k})
+			contents["udp6"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(upd6))
+		}
+	}
+
+	return newTaskOwnedDir(task, inoGen.NextIno(), 0555, contents)
+}
+
+// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6.
+//
+// +stateify savable
+type ifinet6 struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*ifinet6)(nil)
+
+func (n *ifinet6) contents() []string {
+	var lines []string
+	nics := n.stack.Interfaces()
+	for id, naddrs := range n.stack.InterfaceAddrs() {
+		nic, ok := nics[id]
+		if !ok {
+			// NIC is missing from the Interfaces() snapshot taken above. Ignore it.
+			continue
+		}
+
+		for _, a := range naddrs {
+			// IPv6 only.
+			if a.Family != linux.AF_INET6 {
+				continue
+			}
+
+			// Fields:
+			// IPv6 address displayed in 32 hexadecimal chars without colons
+			// Netlink device number (interface index) in hexadecimal (use nic id)
+			// Prefix length in hexadecimal
+			// Scope value (use 0)
+			// Interface flags
+			// Device name
+			lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name))
+		}
+	}
+	return lines
+}
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	for _, l := range n.contents() {
+		buf.WriteString(l)
+	}
+	return nil
+}
+
+// netDevData implements vfs.DynamicBytesSource for /proc/net/dev.
+//
+// +stateify savable
+type netDevData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netDevData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *netDevData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	interfaces := n.stack.Interfaces()
+	buf.WriteString("Inter-|   Receive                                                |  Transmit\n")
+	buf.WriteString(" face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed\n")
+
+	for _, i := range interfaces {
+		// Implements the same format as
+		// net/core/net-procfs.c:dev_seq_printf_stats.
+		var stats inet.StatDev
+		if err := n.stack.Statistics(&stats, i.Name); err != nil {
+			log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err)
+			continue
+		}
+		fmt.Fprintf(
+			buf,
+			"%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n",
+			i.Name,
+			// Received
+			stats[0], // bytes
+			stats[1], // packets
+			stats[2], // errors
+			stats[3], // dropped
+			stats[4], // fifo
+			stats[5], // frame
+			stats[6], // compressed
+			stats[7], // multicast
+			// Transmitted
+			stats[8],  // bytes
+			stats[9],  // packets
+			stats[10], // errors
+			stats[11], // dropped
+			stats[12], // fifo
+			stats[13], // colls
+			stats[14], // carrier
+			stats[15], // compressed
+		)
+	}
+
+	return nil
+}
+
+// netUnixData implements vfs.DynamicBytesSource for /proc/net/unix.
+//
+// +stateify savable
+type netUnixData struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netUnixData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("Num       RefCount Protocol Flags    Type St Inode Path\n")
+	for _, se := range n.kernel.ListSockets() {
+		s := se.Sock.Get()
+		if s == nil {
+			log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
+			continue
+		}
+		sfile := s.(*fs.File)
+		if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
+			s.DecRef()
+			// Not a unix socket.
+			continue
+		}
+		sops := sfile.FileOperations.(*unix.SocketOperations)
+
+		addr, err := sops.Endpoint().GetLocalAddress()
+		if err != nil {
+			log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err)
+			addr.Addr = "<unknown>"
+		}
+
+		sockFlags := 0
+		if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok {
+			if ce.Listening() {
+				// For unix domain sockets, linux reports a single flag
+				// value if the socket is listening, of __SO_ACCEPTCON.
+				sockFlags = linux.SO_ACCEPTCON
+			}
+		}
+
+		// In the socket entry below, the value for the 'Num' field requires
+		// some consideration. Linux prints the address to the struct
+		// unix_sock representing a socket in the kernel, but may redact the
+		// value for unprivileged users depending on the kptr_restrict
+		// sysctl.
+		//
+		// One use for this field is to allow a privileged user to
+		// introspect into the kernel memory to determine information about
+		// a socket not available through procfs, such as the socket's peer.
+		//
+		// In gvisor, returning a pointer to our internal structures would
+		// be pointless, as it wouldn't match the memory layout for struct
+		// unix_sock, making introspection difficult. We could populate a
+		// struct unix_sock with the appropriate data, but even that
+		// requires consideration for which kernel version to emulate, as
+		// the definition of this struct changes over time.
+		//
+		// For now, we always redact this pointer.
+		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d",
+			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
+			sfile.ReadRefs()-1,            // RefCount, don't count our own ref.
+			0,                             // Protocol, always 0 for UDS.
+			sockFlags,                     // Flags.
+			sops.Endpoint().Type(),        // Type.
+			sops.State(),                  // State.
+			sfile.InodeID(),               // Inode.
+		)
+
+		// Path
+		if len(addr.Addr) != 0 {
+			if addr.Addr[0] == 0 {
+				// Abstract path.
+				fmt.Fprintf(buf, " @%s", string(addr.Addr[1:]))
+			} else {
+				fmt.Fprintf(buf, " %s", string(addr.Addr))
+			}
+		}
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef()
+	}
+	return nil
+}
+
+func networkToHost16(n uint16) uint16 {
+	// n is in network byte order, so is big-endian. The most-significant byte
+	// should be stored in the lower address.
+	//
+	// We manually inline binary.BigEndian.Uint16() because Go does not support
+	// non-primitive consts, so binary.BigEndian is a (mutable) var, so calls to
+	// binary.BigEndian.Uint16() require a read of binary.BigEndian and an
+	// interface method call, defeating inlining.
+	buf := [2]byte{byte(n >> 8 & 0xff), byte(n & 0xff)}
+	return usermem.ByteOrder.Uint16(buf[:])
+}
+
+func writeInetAddr(w io.Writer, family int, i linux.SockAddr) {
+	switch family {
+	case linux.AF_INET:
+		var a linux.SockAddrInet
+		if i != nil {
+			a = *i.(*linux.SockAddrInet)
+		}
+
+		// linux.SockAddrInet.Port is stored in the network byte order and is
+		// printed like a number in host byte order. Note that all numbers in host
+		// byte order are printed with the most-significant byte first when
+		// formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux.
+		port := networkToHost16(a.Port)
+
+		// linux.SockAddrInet.Addr is stored as a byte slice in big-endian order
+		// (i.e. most-significant byte in index 0). Linux represents this as a
+		// __be32 which is a typedef for an unsigned int, and is printed with
+		// %X. This means that for a little-endian machine, Linux prints the
+		// least-significant byte of the address first. To emulate this, we first
+		// invert the byte order for the address using usermem.ByteOrder.Uint32,
+		// which makes it have the equivalent encoding to a __be32 on a little
+		// endian machine. Note that this operation is a no-op on a big endian
+		// machine. Then similar to Linux, we format it with %X, which will print
+		// the most-significant byte of the __be32 address first, which is now
+		// actually the least-significant byte of the original address in
+		// linux.SockAddrInet.Addr on little endian machines, due to the conversion.
+		addr := usermem.ByteOrder.Uint32(a.Addr[:])
+
+		fmt.Fprintf(w, "%08X:%04X ", addr, port)
+	case linux.AF_INET6:
+		var a linux.SockAddrInet6
+		if i != nil {
+			a = *i.(*linux.SockAddrInet6)
+		}
+
+		port := networkToHost16(a.Port)
+		addr0 := usermem.ByteOrder.Uint32(a.Addr[0:4])
+		addr1 := usermem.ByteOrder.Uint32(a.Addr[4:8])
+		addr2 := usermem.ByteOrder.Uint32(a.Addr[8:12])
+		addr3 := usermem.ByteOrder.Uint32(a.Addr[12:16])
+		fmt.Fprintf(w, "%08X%08X%08X%08X:%04X ", addr0, addr1, addr2, addr3, port)
+	}
+}
+
+func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, family int) error {
+	// t may be nil here if our caller is not part of a task goroutine. This can
+	// happen for example if we're here for "sentryctl cat". When t is nil,
+	// degrade gracefully and retrieve what we can.
+	t := kernel.TaskFromContext(ctx)
+
+	for _, se := range k.ListSockets() {
+		s := se.Sock.Get()
+		if s == nil {
+			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
+			continue
+		}
+		sfile := s.(*fs.File)
+		sops, ok := sfile.FileOperations.(socket.Socket)
+		if !ok {
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+		}
+		if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
+			s.DecRef()
+			// Not a stream socket of the requested address family.
+			continue
+		}
+
+		// Linux's documentation for the fields below can be found at
+		// https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt.
+		// For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock().
+		// Note that the header doesn't contain labels for all the fields.
+
+		// Field: sl; entry number.
+		fmt.Fprintf(buf, "%4d: ", se.ID)
+
+		// Field: local_address.
+		var localAddr linux.SockAddr
+		if t != nil {
+			if local, _, err := sops.GetSockName(t); err == nil {
+				localAddr = local
+			}
+		}
+		writeInetAddr(buf, family, localAddr)
+
+		// Field: rem_address.
+		var remoteAddr linux.SockAddr
+		if t != nil {
+			if remote, _, err := sops.GetPeerName(t); err == nil {
+				remoteAddr = remote
+			}
+		}
+		writeInetAddr(buf, family, remoteAddr)
+
+		// Field: state; socket state.
+		fmt.Fprintf(buf, "%02X ", sops.State())
+
+		// Field: tx_queue, rx_queue; number of packets in the transmit and
+		// receive queue. Unimplemented.
+		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
+
+		// Field: tr, tm->when; timer active state and number of jiffies
+		// until timer expires. Unimplemented.
+		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
+
+		// Field: retrnsmt; number of unrecovered RTO timeouts.
+		// Unimplemented.
+		fmt.Fprintf(buf, "%08X ", 0)
+
+		// Field: uid.
+		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
+		if err != nil {
+			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+			fmt.Fprintf(buf, "%5d ", 0)
+		} else {
+			creds := auth.CredentialsFromContext(ctx)
+			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
+		}
+
+		// Field: timeout; number of unanswered 0-window probes.
+		// Unimplemented.
+		fmt.Fprintf(buf, "%8d ", 0)
+
+		// Field: inode.
+		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+
+		// Field: refcount. Don't count the ref we obtain while dereferencing
+		// the weakref to this socket.
+		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+
+		// Field: Socket struct address. Redacted due to the same reason as
+		// the 'Num' field in /proc/net/unix, see netUnixData.Generate.
+		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
+
+		// Field: retransmit timeout. Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: predicted tick of soft clock (delayed ACK control data).
+		// Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: (ack.quick<<1)|ack.pingpong, Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: sending congestion window, Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: Slow start size threshold, -1 if threshold >= 0xFFFF.
+		// Unimplemented, report as large threshold.
+		fmt.Fprintf(buf, "%d", -1)
+
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef()
+	}
+
+	return nil
+}
+
+// netTCPData implements vfs.DynamicBytesSource for /proc/net/tcp.
+//
+// +stateify savable
+type netTCPData struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netTCPData)(nil)
+
+func (d *netTCPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode                                                     \n")
+	return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET)
+}
+
+// netTCP6Data implements vfs.DynamicBytesSource for /proc/net/tcp6.
+//
+// +stateify savable
+type netTCP6Data struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netTCP6Data)(nil)
+
+func (d *netTCP6Data) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n")
+	return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET6)
+}
+
+// netUDPData implements vfs.DynamicBytesSource for /proc/net/udp.
+//
+// +stateify savable
+type netUDPData struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netUDPData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	// t may be nil here if our caller is not part of a task goroutine. This can
+	// happen for example if we're here for "sentryctl cat". When t is nil,
+	// degrade gracefully and retrieve what we can.
+	t := kernel.TaskFromContext(ctx)
+
+	for _, se := range d.kernel.ListSockets() {
+		s := se.Sock.Get()
+		if s == nil {
+			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
+			continue
+		}
+		sfile := s.(*fs.File)
+		sops, ok := sfile.FileOperations.(socket.Socket)
+		if !ok {
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+		}
+		if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
+			s.DecRef()
+			// Not udp4 socket.
+			continue
+		}
+
+		// For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock().
+
+		// Field: sl; entry number.
+		fmt.Fprintf(buf, "%5d: ", se.ID)
+
+		// Field: local_adddress.
+		var localAddr linux.SockAddrInet
+		if t != nil {
+			if local, _, err := sops.GetSockName(t); err == nil {
+				localAddr = *local.(*linux.SockAddrInet)
+			}
+		}
+		writeInetAddr(buf, linux.AF_INET, &localAddr)
+
+		// Field: rem_address.
+		var remoteAddr linux.SockAddrInet
+		if t != nil {
+			if remote, _, err := sops.GetPeerName(t); err == nil {
+				remoteAddr = *remote.(*linux.SockAddrInet)
+			}
+		}
+		writeInetAddr(buf, linux.AF_INET, &remoteAddr)
+
+		// Field: state; socket state.
+		fmt.Fprintf(buf, "%02X ", sops.State())
+
+		// Field: tx_queue, rx_queue; number of packets in the transmit and
+		// receive queue. Unimplemented.
+		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
+
+		// Field: tr, tm->when. Always 0 for UDP.
+		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
+
+		// Field: retrnsmt. Always 0 for UDP.
+		fmt.Fprintf(buf, "%08X ", 0)
+
+		// Field: uid.
+		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
+		if err != nil {
+			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+			fmt.Fprintf(buf, "%5d ", 0)
+		} else {
+			creds := auth.CredentialsFromContext(ctx)
+			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
+		}
+
+		// Field: timeout. Always 0 for UDP.
+		fmt.Fprintf(buf, "%8d ", 0)
+
+		// Field: inode.
+		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+
+		// Field: ref; reference count on the socket inode. Don't count the ref
+		// we obtain while dereferencing the weakref to this socket.
+		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+
+		// Field: Socket struct address. Redacted due to the same reason as
+		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
+		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
+
+		// Field: drops; number of dropped packets. Unimplemented.
+		fmt.Fprintf(buf, "%d", 0)
+
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef()
+	}
+	return nil
+}
+
+// netSnmpData implements vfs.DynamicBytesSource for /proc/net/snmp.
+//
+// +stateify savable
+type netSnmpData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netSnmpData)(nil)
+
+type snmpLine struct {
+	prefix string
+	header string
+}
+
+var snmp = []snmpLine{
+	{
+		prefix: "Ip",
+		header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates",
+	},
+	{
+		prefix: "Icmp",
+		header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps",
+	},
+	{
+		prefix: "IcmpMsg",
+	},
+	{
+		prefix: "Tcp",
+		header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors",
+	},
+	{
+		prefix: "Udp",
+		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
+	},
+	{
+		prefix: "UdpLite",
+		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
+	},
+}
+
+func toSlice(a interface{}) []uint64 {
+	v := reflect.Indirect(reflect.ValueOf(a))
+	return v.Slice(0, v.Len()).Interface().([]uint64)
+}
+
+func sprintSlice(s []uint64) string {
+	if len(s) == 0 {
+		return ""
+	}
+	r := fmt.Sprint(s)
+	return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice.
+}
+
+// Generate implements vfs.DynamicBytesSource. It writes a header line and a value line per snmp entry.
+func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	types := []interface{}{
+		&inet.StatSNMPIP{},
+		&inet.StatSNMPICMP{},
+		nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats.
+		&inet.StatSNMPTCP{},
+		&inet.StatSNMPUDP{},
+		&inet.StatSNMPUDPLite{},
+	}
+	for i, stat := range types {
+		line := snmp[i]
+		if stat == nil {
+			fmt.Fprintf(buf, "%s:\n", line.prefix)
+			fmt.Fprintf(buf, "%s:\n", line.prefix)
+			continue
+		}
+		if err := d.stack.Statistics(stat, line.prefix); err != nil {
+			if err == syserror.EOPNOTSUPP {
+				log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
+			} else {
+				log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
+			}
+		}
+		// Statistics errors are only logged; both lines are still written below.
+		fmt.Fprintf(buf, "%s: %s\n", line.prefix, line.header)
+
+		if line.prefix == "Tcp" {
+			tcp := stat.(*inet.StatSNMPTCP)
+			// "Tcp" needs special processing because MaxConn is signed. RFC 2012.
+			fmt.Fprintf(buf, "%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:]))
+		} else {
+			fmt.Fprintf(buf, "%s: %s\n", line.prefix, sprintSlice(toSlice(stat)))
+		}
+	}
+	return nil
+}
+
+// netRouteData implements vfs.DynamicBytesSource for /proc/net/route.
+//
+// +stateify savable
+type netRouteData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netRouteData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
+func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT")
+
+	interfaces := d.stack.Interfaces()
+	for _, rt := range d.stack.RouteTable() {
+		// /proc/net/route only includes ipv4 routes.
+		if rt.Family != linux.AF_INET {
+			continue
+		}
+
+		// /proc/net/route does not include broadcast or multicast routes.
+		if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST {
+			continue
+		}
+
+		iface, ok := interfaces[rt.OutputInterface]
+		if !ok || iface.Name == "lo" {
+			continue
+		}
+
+		var (
+			gw     uint32
+			prefix uint32
+			flags  = linux.RTF_UP
+		)
+		if len(rt.GatewayAddr) == header.IPv4AddressSize {
+			flags |= linux.RTF_GATEWAY
+			gw = usermem.ByteOrder.Uint32(rt.GatewayAddr)
+		}
+		if len(rt.DstAddr) == header.IPv4AddressSize {
+			prefix = usermem.ByteOrder.Uint32(rt.DstAddr)
+		}
+		l := fmt.Sprintf(
+			"%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d",
+			iface.Name,
+			prefix,
+			gw,
+			flags,
+			0, // RefCnt.
+			0, // Use.
+			0, // Metric.
+			(uint32(1)<<rt.DstLen)-1,
+			0, // MTU.
+			0, // Window.
+			0, // RTT.
+		)
+		fmt.Fprintf(buf, "%-127s\n", l)
+	}
+	return nil
+}
+
+// netStatData implements vfs.DynamicBytesSource for /proc/net/netstat.
+//
+// +stateify savable
+type netStatData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netStatData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+// See Linux's net/ipv4/proc.c:netstat_seq_show.
+func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " +
+		"EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps " +
+		"LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive " +
+		"PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost " +
+		"ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog " +
+		"TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser " +
+		"TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging " +
+		"TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo " +
+		"TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit " +
+		"TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans " +
+		"TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes " +
+		"TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail " +
+		"TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent " +
+		"TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose " +
+		"TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed " +
+		"TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld " +
+		"TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected " +
+		"TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback " +
+		"TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter " +
+		"TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail " +
+		"TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK " +
+		"TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail " +
+		"TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow " +
+		"TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets " +
+		"TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv " +
+		"TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect " +
+		"TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd " +
+		"TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq " +
+		"TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge " +
+		"TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index b1e39c82f..d203cebd4 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -72,7 +72,7 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames
 		"sys":     newSysDir(root, inoGen, k),
 		"meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}),
 		"mounts":  kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"),
-		"net":     newNetDir(root, inoGen, k),
+		"net":     kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/net"),
 		"stat":    newDentry(root, inoGen.NextIno(), 0444, &statData{k: k}),
 		"uptime":  newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}),
 		"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{k: k}),
diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go
deleted file mode 100644
index d4e1812d8..000000000
--- a/pkg/sentry/fsimpl/proc/tasks_net.go
+++ /dev/null
@@ -1,787 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-	"io"
-	"reflect"
-	"time"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
-	"gvisor.dev/gvisor/pkg/sentry/inet"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/socket"
-	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
-	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry {
-	var contents map[string]*kernfs.Dentry
-	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
-	// network namespace of the calling process. We should make this per-process,
-	// a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net.
-	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
-		const (
-			arp       = "IP address       HW type     Flags       HW address            Mask     Device\n"
-			netlink   = "sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n"
-			packet    = "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n"
-			protocols = "protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"
-			ptype     = "Type Device      Function\n"
-			upd6      = "  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n"
-		)
-		psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond))
-
-		contents = map[string]*kernfs.Dentry{
-			"dev":  newDentry(root, inoGen.NextIno(), 0444, &netDevData{stack: stack}),
-			"snmp": newDentry(root, inoGen.NextIno(), 0444, &netSnmpData{stack: stack}),
-
-			// The following files are simple stubs until they are implemented in
-			// netstack, if the file contains a header the stub is just the header
-			// otherwise it is an empty file.
-			"arp":       newDentry(root, inoGen.NextIno(), 0444, newStaticFile(arp)),
-			"netlink":   newDentry(root, inoGen.NextIno(), 0444, newStaticFile(netlink)),
-			"netstat":   newDentry(root, inoGen.NextIno(), 0444, &netStatData{}),
-			"packet":    newDentry(root, inoGen.NextIno(), 0444, newStaticFile(packet)),
-			"protocols": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(protocols)),
-
-			// Linux sets psched values to: nsec per usec, psched tick in ns, 1000000,
-			// high res timer ticks per sec (ClockGetres returns 1ns resolution).
-			"psched": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(psched)),
-			"ptype":  newDentry(root, inoGen.NextIno(), 0444, newStaticFile(ptype)),
-			"route":  newDentry(root, inoGen.NextIno(), 0444, &netRouteData{stack: stack}),
-			"tcp":    newDentry(root, inoGen.NextIno(), 0444, &netTCPData{kernel: k}),
-			"udp":    newDentry(root, inoGen.NextIno(), 0444, &netUDPData{kernel: k}),
-			"unix":   newDentry(root, inoGen.NextIno(), 0444, &netUnixData{kernel: k}),
-		}
-
-		if stack.SupportsIPv6() {
-			contents["if_inet6"] = newDentry(root, inoGen.NextIno(), 0444, &ifinet6{stack: stack})
-			contents["ipv6_route"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(""))
-			contents["tcp6"] = newDentry(root, inoGen.NextIno(), 0444, &netTCP6Data{kernel: k})
-			contents["udp6"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(upd6))
-		}
-	}
-
-	return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents)
-}
-
-// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6.
-//
-// +stateify savable
-type ifinet6 struct {
-	kernfs.DynamicBytesFile
-
-	stack inet.Stack
-}
-
-var _ dynamicInode = (*ifinet6)(nil)
-
-func (n *ifinet6) contents() []string {
-	var lines []string
-	nics := n.stack.Interfaces()
-	for id, naddrs := range n.stack.InterfaceAddrs() {
-		nic, ok := nics[id]
-		if !ok {
-			// NIC was added after NICNames was called. We'll just ignore it.
-			continue
-		}
-
-		for _, a := range naddrs {
-			// IPv6 only.
-			if a.Family != linux.AF_INET6 {
-				continue
-			}
-
-			// Fields:
-			// IPv6 address displayed in 32 hexadecimal chars without colons
-			// Netlink device number (interface index) in hexadecimal (use nic id)
-			// Prefix length in hexadecimal
-			// Scope value (use 0)
-			// Interface flags
-			// Device name
-			lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name))
-		}
-	}
-	return lines
-}
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	for _, l := range n.contents() {
-		buf.WriteString(l)
-	}
-	return nil
-}
-
-// netDevData implements vfs.DynamicBytesSource for /proc/net/dev.
-//
-// +stateify savable
-type netDevData struct {
-	kernfs.DynamicBytesFile
-
-	stack inet.Stack
-}
-
-var _ dynamicInode = (*netDevData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (n *netDevData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	interfaces := n.stack.Interfaces()
-	buf.WriteString("Inter-|   Receive                                                |  Transmit\n")
-	buf.WriteString(" face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed\n")
-
-	for _, i := range interfaces {
-		// Implements the same format as
-		// net/core/net-procfs.c:dev_seq_printf_stats.
-		var stats inet.StatDev
-		if err := n.stack.Statistics(&stats, i.Name); err != nil {
-			log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err)
-			continue
-		}
-		fmt.Fprintf(
-			buf,
-			"%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n",
-			i.Name,
-			// Received
-			stats[0], // bytes
-			stats[1], // packets
-			stats[2], // errors
-			stats[3], // dropped
-			stats[4], // fifo
-			stats[5], // frame
-			stats[6], // compressed
-			stats[7], // multicast
-			// Transmitted
-			stats[8],  // bytes
-			stats[9],  // packets
-			stats[10], // errors
-			stats[11], // dropped
-			stats[12], // fifo
-			stats[13], // frame
-			stats[14], // compressed
-			stats[15], // multicast
-		)
-	}
-
-	return nil
-}
-
-// netUnixData implements vfs.DynamicBytesSource for /proc/net/unix.
-//
-// +stateify savable
-type netUnixData struct {
-	kernfs.DynamicBytesFile
-
-	kernel *kernel.Kernel
-}
-
-var _ dynamicInode = (*netUnixData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	buf.WriteString("Num       RefCount Protocol Flags    Type St Inode Path\n")
-	for _, se := range n.kernel.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
-			continue
-		}
-		sfile := s.(*fs.File)
-		if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
-			s.DecRef()
-			// Not a unix socket.
-			continue
-		}
-		sops := sfile.FileOperations.(*unix.SocketOperations)
-
-		addr, err := sops.Endpoint().GetLocalAddress()
-		if err != nil {
-			log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err)
-			addr.Addr = "<unknown>"
-		}
-
-		sockFlags := 0
-		if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok {
-			if ce.Listening() {
-				// For unix domain sockets, linux reports a single flag
-				// value if the socket is listening, of __SO_ACCEPTCON.
-				sockFlags = linux.SO_ACCEPTCON
-			}
-		}
-
-		// In the socket entry below, the value for the 'Num' field requires
-		// some consideration. Linux prints the address to the struct
-		// unix_sock representing a socket in the kernel, but may redact the
-		// value for unprivileged users depending on the kptr_restrict
-		// sysctl.
-		//
-		// One use for this field is to allow a privileged user to
-		// introspect into the kernel memory to determine information about
-		// a socket not available through procfs, such as the socket's peer.
-		//
-		// In gvisor, returning a pointer to our internal structures would
-		// be pointless, as it wouldn't match the memory layout for struct
-		// unix_sock, making introspection difficult. We could populate a
-		// struct unix_sock with the appropriate data, but even that
-		// requires consideration for which kernel version to emulate, as
-		// the definition of this struct changes over time.
-		//
-		// For now, we always redact this pointer.
-		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d",
-			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
-			sfile.ReadRefs()-1,            // RefCount, don't count our own ref.
-			0,                             // Protocol, always 0 for UDS.
-			sockFlags,                     // Flags.
-			sops.Endpoint().Type(),        // Type.
-			sops.State(),                  // State.
-			sfile.InodeID(),               // Inode.
-		)
-
-		// Path
-		if len(addr.Addr) != 0 {
-			if addr.Addr[0] == 0 {
-				// Abstract path.
-				fmt.Fprintf(buf, " @%s", string(addr.Addr[1:]))
-			} else {
-				fmt.Fprintf(buf, " %s", string(addr.Addr))
-			}
-		}
-		fmt.Fprintf(buf, "\n")
-
-		s.DecRef()
-	}
-	return nil
-}
-
-func networkToHost16(n uint16) uint16 {
-	// n is in network byte order, so is big-endian. The most-significant byte
-	// should be stored in the lower address.
-	//
-	// We manually inline binary.BigEndian.Uint16() because Go does not support
-	// non-primitive consts, so binary.BigEndian is a (mutable) var, so calls to
-	// binary.BigEndian.Uint16() require a read of binary.BigEndian and an
-	// interface method call, defeating inlining.
-	buf := [2]byte{byte(n >> 8 & 0xff), byte(n & 0xff)}
-	return usermem.ByteOrder.Uint16(buf[:])
-}
-
-func writeInetAddr(w io.Writer, family int, i linux.SockAddr) {
-	switch family {
-	case linux.AF_INET:
-		var a linux.SockAddrInet
-		if i != nil {
-			a = *i.(*linux.SockAddrInet)
-		}
-
-		// linux.SockAddrInet.Port is stored in the network byte order and is
-		// printed like a number in host byte order. Note that all numbers in host
-		// byte order are printed with the most-significant byte first when
-		// formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux.
-		port := networkToHost16(a.Port)
-
-		// linux.SockAddrInet.Addr is stored as a byte slice in big-endian order
-		// (i.e. most-significant byte in index 0). Linux represents this as a
-		// __be32 which is a typedef for an unsigned int, and is printed with
-		// %X. This means that for a little-endian machine, Linux prints the
-		// least-significant byte of the address first. To emulate this, we first
-		// invert the byte order for the address using usermem.ByteOrder.Uint32,
-		// which makes it have the equivalent encoding to a __be32 on a little
-		// endian machine. Note that this operation is a no-op on a big endian
-		// machine. Then similar to Linux, we format it with %X, which will print
-		// the most-significant byte of the __be32 address first, which is now
-		// actually the least-significant byte of the original address in
-		// linux.SockAddrInet.Addr on little endian machines, due to the conversion.
-		addr := usermem.ByteOrder.Uint32(a.Addr[:])
-
-		fmt.Fprintf(w, "%08X:%04X ", addr, port)
-	case linux.AF_INET6:
-		var a linux.SockAddrInet6
-		if i != nil {
-			a = *i.(*linux.SockAddrInet6)
-		}
-
-		port := networkToHost16(a.Port)
-		addr0 := usermem.ByteOrder.Uint32(a.Addr[0:4])
-		addr1 := usermem.ByteOrder.Uint32(a.Addr[4:8])
-		addr2 := usermem.ByteOrder.Uint32(a.Addr[8:12])
-		addr3 := usermem.ByteOrder.Uint32(a.Addr[12:16])
-		fmt.Fprintf(w, "%08X%08X%08X%08X:%04X ", addr0, addr1, addr2, addr3, port)
-	}
-}
-
-func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, family int) error {
-	// t may be nil here if our caller is not part of a task goroutine. This can
-	// happen for example if we're here for "sentryctl cat". When t is nil,
-	// degrade gracefully and retrieve what we can.
-	t := kernel.TaskFromContext(ctx)
-
-	for _, se := range k.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
-			continue
-		}
-		sfile := s.(*fs.File)
-		sops, ok := sfile.FileOperations.(socket.Socket)
-		if !ok {
-			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
-		}
-		if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
-			s.DecRef()
-			// Not tcp4 sockets.
-			continue
-		}
-
-		// Linux's documentation for the fields below can be found at
-		// https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt.
-		// For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock().
-		// Note that the header doesn't contain labels for all the fields.
-
-		// Field: sl; entry number.
-		fmt.Fprintf(buf, "%4d: ", se.ID)
-
-		// Field: local_adddress.
-		var localAddr linux.SockAddr
-		if t != nil {
-			if local, _, err := sops.GetSockName(t); err == nil {
-				localAddr = local
-			}
-		}
-		writeInetAddr(buf, family, localAddr)
-
-		// Field: rem_address.
-		var remoteAddr linux.SockAddr
-		if t != nil {
-			if remote, _, err := sops.GetPeerName(t); err == nil {
-				remoteAddr = remote
-			}
-		}
-		writeInetAddr(buf, family, remoteAddr)
-
-		// Field: state; socket state.
-		fmt.Fprintf(buf, "%02X ", sops.State())
-
-		// Field: tx_queue, rx_queue; number of packets in the transmit and
-		// receive queue. Unimplemented.
-		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
-
-		// Field: tr, tm->when; timer active state and number of jiffies
-		// until timer expires. Unimplemented.
-		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
-
-		// Field: retrnsmt; number of unrecovered RTO timeouts.
-		// Unimplemented.
-		fmt.Fprintf(buf, "%08X ", 0)
-
-		// Field: uid.
-		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
-		if err != nil {
-			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
-			fmt.Fprintf(buf, "%5d ", 0)
-		} else {
-			creds := auth.CredentialsFromContext(ctx)
-			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
-		}
-
-		// Field: timeout; number of unanswered 0-window probes.
-		// Unimplemented.
-		fmt.Fprintf(buf, "%8d ", 0)
-
-		// Field: inode.
-		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
-
-		// Field: refcount. Don't count the ref we obtain while deferencing
-		// the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
-
-		// Field: Socket struct address. Redacted due to the same reason as
-		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
-		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
-
-		// Field: retransmit timeout. Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: predicted tick of soft clock (delayed ACK control data).
-		// Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: (ack.quick<<1)|ack.pingpong, Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: sending congestion window, Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: Slow start size threshold, -1 if threshold >= 0xFFFF.
-		// Unimplemented, report as large threshold.
-		fmt.Fprintf(buf, "%d", -1)
-
-		fmt.Fprintf(buf, "\n")
-
-		s.DecRef()
-	}
-
-	return nil
-}
-
-// netTCPData implements vfs.DynamicBytesSource for /proc/net/tcp.
-//
-// +stateify savable
-type netTCPData struct {
-	kernfs.DynamicBytesFile
-
-	kernel *kernel.Kernel
-}
-
-var _ dynamicInode = (*netTCPData)(nil)
-
-func (d *netTCPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	buf.WriteString("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode                                                     \n")
-	return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET)
-}
-
-// netTCP6Data implements vfs.DynamicBytesSource for /proc/net/tcp6.
-//
-// +stateify savable
-type netTCP6Data struct {
-	kernfs.DynamicBytesFile
-
-	kernel *kernel.Kernel
-}
-
-var _ dynamicInode = (*netTCP6Data)(nil)
-
-func (d *netTCP6Data) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	buf.WriteString("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n")
-	return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET6)
-}
-
-// netUDPData implements vfs.DynamicBytesSource for /proc/net/udp.
-//
-// +stateify savable
-type netUDPData struct {
-	kernfs.DynamicBytesFile
-
-	kernel *kernel.Kernel
-}
-
-var _ dynamicInode = (*netUDPData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	// t may be nil here if our caller is not part of a task goroutine. This can
-	// happen for example if we're here for "sentryctl cat". When t is nil,
-	// degrade gracefully and retrieve what we can.
-	t := kernel.TaskFromContext(ctx)
-
-	for _, se := range d.kernel.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID)
-			continue
-		}
-		sfile := s.(*fs.File)
-		sops, ok := sfile.FileOperations.(socket.Socket)
-		if !ok {
-			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
-		}
-		if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
-			s.DecRef()
-			// Not udp4 socket.
-			continue
-		}
-
-		// For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock().
-
-		// Field: sl; entry number.
-		fmt.Fprintf(buf, "%5d: ", se.ID)
-
-		// Field: local_adddress.
-		var localAddr linux.SockAddrInet
-		if t != nil {
-			if local, _, err := sops.GetSockName(t); err == nil {
-				localAddr = *local.(*linux.SockAddrInet)
-			}
-		}
-		writeInetAddr(buf, linux.AF_INET, &localAddr)
-
-		// Field: rem_address.
-		var remoteAddr linux.SockAddrInet
-		if t != nil {
-			if remote, _, err := sops.GetPeerName(t); err == nil {
-				remoteAddr = *remote.(*linux.SockAddrInet)
-			}
-		}
-		writeInetAddr(buf, linux.AF_INET, &remoteAddr)
-
-		// Field: state; socket state.
-		fmt.Fprintf(buf, "%02X ", sops.State())
-
-		// Field: tx_queue, rx_queue; number of packets in the transmit and
-		// receive queue. Unimplemented.
-		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
-
-		// Field: tr, tm->when. Always 0 for UDP.
-		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
-
-		// Field: retrnsmt. Always 0 for UDP.
-		fmt.Fprintf(buf, "%08X ", 0)
-
-		// Field: uid.
-		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
-		if err != nil {
-			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
-			fmt.Fprintf(buf, "%5d ", 0)
-		} else {
-			creds := auth.CredentialsFromContext(ctx)
-			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow()))
-		}
-
-		// Field: timeout. Always 0 for UDP.
-		fmt.Fprintf(buf, "%8d ", 0)
-
-		// Field: inode.
-		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
-
-		// Field: ref; reference count on the socket inode. Don't count the ref
-		// we obtain while deferencing the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
-
-		// Field: Socket struct address. Redacted due to the same reason as
-		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
-		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
-
-		// Field: drops; number of dropped packets. Unimplemented.
-		fmt.Fprintf(buf, "%d", 0)
-
-		fmt.Fprintf(buf, "\n")
-
-		s.DecRef()
-	}
-	return nil
-}
-
-// netSnmpData implements vfs.DynamicBytesSource for /proc/net/snmp.
-//
-// +stateify savable
-type netSnmpData struct {
-	kernfs.DynamicBytesFile
-
-	stack inet.Stack
-}
-
-var _ dynamicInode = (*netSnmpData)(nil)
-
-type snmpLine struct {
-	prefix string
-	header string
-}
-
-var snmp = []snmpLine{
-	{
-		prefix: "Ip",
-		header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates",
-	},
-	{
-		prefix: "Icmp",
-		header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps",
-	},
-	{
-		prefix: "IcmpMsg",
-	},
-	{
-		prefix: "Tcp",
-		header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors",
-	},
-	{
-		prefix: "Udp",
-		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
-	},
-	{
-		prefix: "UdpLite",
-		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
-	},
-}
-
-func toSlice(a interface{}) []uint64 {
-	v := reflect.Indirect(reflect.ValueOf(a))
-	return v.Slice(0, v.Len()).Interface().([]uint64)
-}
-
-func sprintSlice(s []uint64) string {
-	if len(s) == 0 {
-		return ""
-	}
-	r := fmt.Sprint(s)
-	return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice.
-}
-
-// Generate implements vfs.DynamicBytesSource.
-func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	types := []interface{}{
-		&inet.StatSNMPIP{},
-		&inet.StatSNMPICMP{},
-		nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats.
-		&inet.StatSNMPTCP{},
-		&inet.StatSNMPUDP{},
-		&inet.StatSNMPUDPLite{},
-	}
-	for i, stat := range types {
-		line := snmp[i]
-		if stat == nil {
-			fmt.Fprintf(buf, "%s:\n", line.prefix)
-			fmt.Fprintf(buf, "%s:\n", line.prefix)
-			continue
-		}
-		if err := d.stack.Statistics(stat, line.prefix); err != nil {
-			if err == syserror.EOPNOTSUPP {
-				log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
-			} else {
-				log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
-			}
-		}
-
-		fmt.Fprintf(buf, "%s: %s\n", line.prefix, line.header)
-
-		if line.prefix == "Tcp" {
-			tcp := stat.(*inet.StatSNMPTCP)
-			// "Tcp" needs special processing because MaxConn is signed. RFC 2012.
-			fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:]))
-		} else {
-			fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat)))
-		}
-	}
-	return nil
-}
-
-// netRouteData implements vfs.DynamicBytesSource for /proc/net/route.
-//
-// +stateify savable
-type netRouteData struct {
-	kernfs.DynamicBytesFile
-
-	stack inet.Stack
-}
-
-var _ dynamicInode = (*netRouteData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.
-// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
-func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT")
-
-	interfaces := d.stack.Interfaces()
-	for _, rt := range d.stack.RouteTable() {
-		// /proc/net/route only includes ipv4 routes.
-		if rt.Family != linux.AF_INET {
-			continue
-		}
-
-		// /proc/net/route does not include broadcast or multicast routes.
-		if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST {
-			continue
-		}
-
-		iface, ok := interfaces[rt.OutputInterface]
-		if !ok || iface.Name == "lo" {
-			continue
-		}
-
-		var (
-			gw     uint32
-			prefix uint32
-			flags  = linux.RTF_UP
-		)
-		if len(rt.GatewayAddr) == header.IPv4AddressSize {
-			flags |= linux.RTF_GATEWAY
-			gw = usermem.ByteOrder.Uint32(rt.GatewayAddr)
-		}
-		if len(rt.DstAddr) == header.IPv4AddressSize {
-			prefix = usermem.ByteOrder.Uint32(rt.DstAddr)
-		}
-		l := fmt.Sprintf(
-			"%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d",
-			iface.Name,
-			prefix,
-			gw,
-			flags,
-			0, // RefCnt.
-			0, // Use.
-			0, // Metric.
-			(uint32(1)<<rt.DstLen)-1,
-			0, // MTU.
-			0, // Window.
-			0, // RTT.
-		)
-		fmt.Fprintf(buf, "%-127s\n", l)
-	}
-	return nil
-}
-
-// netStatData implements vfs.DynamicBytesSource for /proc/net/netstat.
-//
-// +stateify savable
-type netStatData struct {
-	kernfs.DynamicBytesFile
-
-	stack inet.Stack
-}
-
-var _ dynamicInode = (*netStatData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.
-// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
-func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " +
-		"EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps " +
-		"LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive " +
-		"PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost " +
-		"ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog " +
-		"TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser " +
-		"TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging " +
-		"TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo " +
-		"TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit " +
-		"TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans " +
-		"TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes " +
-		"TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail " +
-		"TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent " +
-		"TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose " +
-		"TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed " +
-		"TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld " +
-		"TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected " +
-		"TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback " +
-		"TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter " +
-		"TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail " +
-		"TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK " +
-		"TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail " +
-		"TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow " +
-		"TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets " +
-		"TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv " +
-		"TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect " +
-		"TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd " +
-		"TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq " +
-		"TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge " +
-		"TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 0eb401619..1bb9430c0 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -50,7 +50,7 @@ var (
 		"loadavg":     linux.DT_REG,
 		"meminfo":     linux.DT_REG,
 		"mounts":      linux.DT_LNK,
-		"net":         linux.DT_DIR,
+		"net":         linux.DT_LNK,
 		"self":        linux.DT_LNK,
 		"stat":        linux.DT_REG,
 		"sys":         linux.DT_DIR,
@@ -71,6 +71,7 @@ var (
 		"gid_map":       linux.DT_REG,
 		"io":            linux.DT_REG,
 		"maps":          linux.DT_REG,
+		"net":           linux.DT_DIR,
 		"ns":            linux.DT_DIR,
 		"oom_score":     linux.DT_REG,
 		"oom_score_adj": linux.DT_REG,
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index 3a611a86f..05c952b99 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -33,6 +33,31 @@ namespace gvisor {
 namespace testing {
 namespace {
 
+constexpr const char kProcNet[] = "/proc/net";
+
+TEST(ProcNetSymlinkTarget, FileMode) {
+  struct stat s;
+  ASSERT_THAT(stat(kProcNet, &s), SyscallSucceeds());
+  EXPECT_EQ(s.st_mode & S_IFMT, S_IFDIR);
+  EXPECT_EQ(s.st_mode & 0777, 0555);
+}
+
+TEST(ProcNetSymlink, FileMode) {
+  struct stat s;
+  ASSERT_THAT(lstat(kProcNet, &s), SyscallSucceeds());
+  EXPECT_EQ(s.st_mode & S_IFMT, S_IFLNK);
+  EXPECT_EQ(s.st_mode & 0777, 0777);
+}
+
+TEST(ProcNetSymlink, Contents) {
+  char buf[40] = {};
+  int n = readlink(kProcNet, buf, sizeof(buf) - 1);
+  ASSERT_THAT(n, SyscallSucceeds());
+  // readlink(2) does not NUL-terminate; the last byte of buf is reserved for it.
+  buf[n] = 0;
+  EXPECT_STREQ(buf, "self/net");
+}
+
 TEST(ProcNetIfInet6, Format) {
   auto ifinet6 = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/if_inet6"));
   EXPECT_THAT(ifinet6,
-- 
cgit v1.2.3


From bbf86003bfd2a7547744b89c72e1cd06e9385e66 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Thu, 12 Mar 2020 14:34:16 -0700
Subject: Remove flaky network namespace test that uses clone().

PiperOrigin-RevId: 300626011
---
 test/syscalls/linux/BUILD                |  3 +-
 test/syscalls/linux/network_namespace.cc | 87 ++++----------------------------
 2 files changed, 10 insertions(+), 80 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 43455f1a3..636e5db12 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3707,11 +3707,10 @@ cc_binary(
         ":socket_test_util",
         gtest,
         "//test/util:capability_util",
-        "//test/util:memory_util",
+        "//test/util:posix_error",
         "//test/util:test_main",
         "//test/util:test_util",
         "//test/util:thread_util",
-        "@com_google_absl//absl/synchronization",
     ],
 )
 
diff --git a/test/syscalls/linux/network_namespace.cc b/test/syscalls/linux/network_namespace.cc
index 6ea48c263..133fdecf0 100644
--- a/test/syscalls/linux/network_namespace.cc
+++ b/test/syscalls/linux/network_namespace.cc
@@ -20,102 +20,33 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "absl/synchronization/notification.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/capability_util.h"
-#include "test/util/memory_util.h"
+#include "test/util/posix_error.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
 
 namespace gvisor {
 namespace testing {
-
 namespace {
 
-using TestFunc = std::function<PosixError()>;
-using RunFunc = std::function<PosixError(TestFunc)>;
-
-struct NamespaceStrategy {
-  RunFunc run;
-
-  static NamespaceStrategy Of(RunFunc run) {
-    NamespaceStrategy s;
-    s.run = run;
-    return s;
-  }
-};
+TEST(NetworkNamespaceTest, LoopbackExists) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
 
-PosixError RunWithUnshare(TestFunc fn) {
-  PosixError err = PosixError(-1, "function did not return a value");
   ScopedThread t([&] {
-    if (unshare(CLONE_NEWNET) != 0) {
-      err = PosixError(errno);
-      return;
-    }
-    err = fn();
-  });
-  t.Join();
-  return err;
-}
+    ASSERT_THAT(unshare(CLONE_NEWNET), SyscallSucceedsWithValue(0));
 
-PosixError RunWithClone(TestFunc fn) {
-  struct Args {
-    absl::Notification n;
-    TestFunc fn;
-    PosixError err;
-  };
-  Args args;
-  args.fn = fn;
-  args.err = PosixError(-1, "function did not return a value");
-
-  ASSIGN_OR_RETURN_ERRNO(
-      Mapping child_stack,
-      MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
-  pid_t child = clone(
-      +[](void *arg) {
-        Args *args = reinterpret_cast<Args *>(arg);
-        args->err = args->fn();
-        args->n.Notify();
-        syscall(SYS_exit, 0);  // Exit manually. No return address on stack.
-        return 0;
-      },
-      reinterpret_cast<void *>(child_stack.addr() + kPageSize),
-      CLONE_NEWNET | CLONE_THREAD | CLONE_SIGHAND | CLONE_VM, &args);
-  if (child < 0) {
-    return PosixError(errno, "clone() failed");
-  }
-  args.n.WaitForNotification();
-  return args.err;
-}
-
-class NetworkNamespaceTest
-    : public ::testing::TestWithParam<NamespaceStrategy> {};
-
-TEST_P(NetworkNamespaceTest, LoopbackExists) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
-
-  EXPECT_NO_ERRNO(GetParam().run([]() {
     // TODO(gvisor.dev/issue/1833): Update this to test that only "lo" exists.
     // Check loopback device exists.
     int sock = socket(AF_INET, SOCK_DGRAM, 0);
-    if (sock < 0) {
-      return PosixError(errno, "socket() failed");
-    }
+    ASSERT_THAT(sock, SyscallSucceeds());
     struct ifreq ifr;
-    snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
-    if (ioctl(sock, SIOCGIFINDEX, &ifr) < 0) {
-      return PosixError(errno, "ioctl() failed, lo cannot be found");
-    }
-    return NoError();
-  }));
+    strncpy(ifr.ifr_name, "lo", IFNAMSIZ);
+    EXPECT_THAT(ioctl(sock, SIOCGIFINDEX, &ifr), SyscallSucceeds())
+        << "lo cannot be found";
+  });
 }
 
-INSTANTIATE_TEST_SUITE_P(
-    AllNetworkNamespaceTest, NetworkNamespaceTest,
-    ::testing::Values(NamespaceStrategy::Of(RunWithUnshare),
-                      NamespaceStrategy::Of(RunWithClone)));
-
 }  // namespace
-
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 333b74dc288357e192dbd86f6d0732be5ea7df64 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 13 Mar 2020 03:02:26 +0000
Subject: Enable syscall seccomp test on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: Ibc926c917d98b31fc92bbf8d82d6818c39b0f93c
---
 test/syscalls/linux/seccomp.cc | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
index 8e0fc9acc..06cc6a64e 100644
--- a/test/syscalls/linux/seccomp.cc
+++ b/test/syscalls/linux/seccomp.cc
@@ -72,8 +72,15 @@ void ApplySeccompFilter(uint32_t sysno, uint32_t filtered_result,
   struct sock_filter filter[] = {
       // A = seccomp_data.arch
       BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 4),
+#if defined(__x86_64__)
       // if (A != AUDIT_ARCH_X86_64) goto kill
       BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 4),
+#elif defined(__aarch64__)
+      // if (A != AUDIT_ARCH_AARCH64) goto kill
+      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_AARCH64, 0, 4),
+#else
+#error "Unknown architecture"
+#endif
       // A = seccomp_data.nr
       BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0),
       // if (A != sysno) goto allow
@@ -179,9 +186,12 @@ TEST(SeccompTest, RetTrapCausesSIGSYS) {
           TEST_CHECK(info->si_errno == kTrapValue);
           TEST_CHECK(info->si_call_addr != nullptr);
           TEST_CHECK(info->si_syscall == kFilteredSyscall);
-#ifdef __x86_64__
+#if defined(__x86_64__)
           TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64);
           TEST_CHECK(uc->uc_mcontext.gregs[REG_RAX] == kFilteredSyscall);
+#elif defined(__aarch64__)
+          TEST_CHECK(info->si_arch == AUDIT_ARCH_AARCH64);
+          TEST_CHECK(uc->uc_mcontext.regs[8] == kFilteredSyscall);
 #endif  // defined(__x86_64__)
           _exit(0);
         });
-- 
cgit v1.2.3


From 5e413cad10d2358a21dd08216953faee70e62a0b Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Sat, 14 Mar 2020 07:13:15 -0700
Subject: Plumb VFS2 imported fds into virtual filesystem.

- When setting up the virtual filesystem, mount a host.filesystem to contain
  all files that need to be imported.
- Make read/preadv syscalls to the host in cases where preadv2 may not be
  supported yet (likewise for writing).
- Make save/restore functions in kernel/kernel.go return early if vfs2 is
  enabled.

PiperOrigin-RevId: 300922353
---
 pkg/abi/linux/file.go                  |   3 +
 pkg/sentry/fs/host/control.go          |   2 +
 pkg/sentry/fsimpl/host/BUILD           |   2 +
 pkg/sentry/fsimpl/host/default_file.go |  45 +++++++-----
 pkg/sentry/fsimpl/host/host.go         | 124 ++++++++++++++++++++++++++++++---
 pkg/sentry/fsimpl/host/util.go         |  28 ++------
 pkg/sentry/kernel/kernel.go            |  40 +++++++----
 pkg/sentry/syscalls/linux/sys_stat.go  |   5 +-
 pkg/sentry/syscalls/linux/vfs2/stat.go |   6 +-
 runsc/boot/filter/config.go            |   1 +
 test/syscalls/linux/stat.cc            |  60 ++++++++++++++--
 11 files changed, 246 insertions(+), 70 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index e229ac21c..dbe58acbe 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -266,6 +266,9 @@ type Statx struct {
 	DevMinor       uint32
 }
 
+// SizeOfStatx is the size of a Statx struct.
+var SizeOfStatx = binary.Size(Statx{})
+
 // FileMode represents a mode_t.
 type FileMode uint16
 
diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go
index 1658979fc..cd84e1337 100644
--- a/pkg/sentry/fs/host/control.go
+++ b/pkg/sentry/fs/host/control.go
@@ -32,6 +32,8 @@ func newSCMRights(fds []int) control.SCMRights {
 }
 
 // Files implements control.SCMRights.Files.
+//
+// TODO(gvisor.dev/issue/2017): Port to VFS2.
 func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFiles, bool) {
 	n := max
 	var trunc bool
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index 731f192b3..5d67f88e3 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -9,9 +9,11 @@ go_library(
         "host.go",
         "util.go",
     ],
+    visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/fd",
         "//pkg/log",
         "//pkg/refs",
         "//pkg/safemem",
diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go
index 172cdb161..98682ba5e 100644
--- a/pkg/sentry/fsimpl/host/default_file.go
+++ b/pkg/sentry/fsimpl/host/default_file.go
@@ -21,6 +21,7 @@ import (
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -64,9 +65,7 @@ func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts v
 			panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
 		}
 
-		f.mu.Lock()
 		n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags))
-		f.mu.Unlock()
 		if isBlockError(err) {
 			// If we got any data at all, return it as a "completed" partial read
 			// rather than retrying until complete.
@@ -86,16 +85,22 @@ func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts v
 	return n, err
 }
 
-func readFromHostFD(ctx context.Context, fd int, dst usermem.IOSequence, offset int64, flags int) (int64, error) {
-	if flags&^(linux.RWF_VALID) != 0 {
+func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags int) (int64, error) {
+	// TODO(gvisor.dev/issue/1672): Support select preadv2 flags.
+	if flags != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
 
-	reader := safemem.FromVecReaderFunc{
-		func(srcs [][]byte) (int64, error) {
-			n, err := unix.Preadv2(fd, srcs, offset, flags)
-			return int64(n), err
-		},
+	var reader safemem.Reader
+	if offset == -1 {
+		reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)}
+	} else {
+		reader = safemem.FromVecReaderFunc{
+			func(srcs [][]byte) (int64, error) {
+				n, err := unix.Preadv(hostFD, srcs, offset)
+				return int64(n), err
+			},
+		}
 	}
 	n, err := dst.CopyOutFrom(ctx, reader)
 	return int64(n), err
@@ -120,9 +125,7 @@ func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts
 			panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
 		}
 
-		f.mu.Lock()
 		n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags))
-		f.mu.Unlock()
 		if isBlockError(err) {
 			err = syserror.ErrWouldBlock
 		}
@@ -137,16 +140,22 @@ func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts
 	return n, err
 }
 
-func writeToHostFD(ctx context.Context, fd int, src usermem.IOSequence, offset int64, flags int) (int64, error) {
-	if flags&^(linux.RWF_VALID) != 0 {
+func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags int) (int64, error) {
+	// TODO(gvisor.dev/issue/1672): Support select pwritev2 flags.
+	if flags != 0 {
 		return 0, syserror.EOPNOTSUPP
 	}
 
-	writer := safemem.FromVecWriterFunc{
-		func(srcs [][]byte) (int64, error) {
-			n, err := unix.Pwritev2(fd, srcs, offset, flags)
-			return int64(n), err
-		},
+	var writer safemem.Writer
+	if offset == -1 {
+		writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)}
+	} else {
+		writer = safemem.FromVecWriterFunc{
+			func(srcs [][]byte) (int64, error) {
+				n, err := unix.Pwritev(hostFD, srcs, offset)
+				return int64(n), err
+			},
+		}
 	}
 	n, err := src.CopyInTo(ctx, writer)
 	return int64(n), err
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index c205e6a0b..0be812d13 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -38,10 +38,19 @@ type filesystem struct {
 	kernfs.Filesystem
 }
 
+// NewMount returns a new disconnected mount in vfsObj that may be passed to ImportFD.
+func NewMount(vfsObj *vfs.VirtualFilesystem) (*vfs.Mount, error) {
+	fs := &filesystem{}
+	fs.Init(vfsObj)
+	vfsfs := fs.VFSFilesystem()
+	// NewDisconnectedMount will take an additional reference on vfsfs.
+	defer vfsfs.DecRef()
+	return vfsObj.NewDisconnectedMount(vfsfs, nil, &vfs.MountOptions{})
+}
+
 // ImportFD sets up and returns a vfs.FileDescription from a donated fd.
 func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID, isTTY bool) (*vfs.FileDescription, error) {
-	// Must be importing to a mount of host.filesystem.
-	fs, ok := mnt.Filesystem().Impl().(*filesystem)
+	fs, ok := mnt.Filesystem().Impl().(*kernfs.Filesystem)
 	if !ok {
 		return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl())
 	}
@@ -54,8 +63,7 @@ func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID
 
 	fileMode := linux.FileMode(s.Mode)
 	fileType := fileMode.FileType()
-	// Pipes, character devices, and sockets can return EWOULDBLOCK for
-	// operations that would block.
+	// Pipes, character devices, and sockets.
 	isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
 
 	i := &inode{
@@ -143,11 +151,109 @@ func (i *inode) Mode() linux.FileMode {
 
 // Stat implements kernfs.Inode.
 func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	if opts.Mask&linux.STATX__RESERVED != 0 {
+		return linux.Statx{}, syserror.EINVAL
+	}
+	if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE {
+		return linux.Statx{}, syserror.EINVAL
+	}
+
+	// Limit our host call only to known flags; mask also selects which fields are copied below.
+	mask := opts.Mask & linux.STATX_ALL
 	var s unix.Statx_t
-	if err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(opts.Mask), &s); err != nil {
+	err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s)
+	// Fallback to fstat(2), if statx(2) is not supported on the host.
+	//
+	// TODO(b/151263641): Remove fallback.
+	if err == syserror.ENOSYS {
+		return i.fstat(opts)
+	} else if err != nil {
+		return linux.Statx{}, err
+	}
+
+	ls := linux.Statx{Mask: mask}
+	// Unconditionally fill blksize and attributes, as indicated by
+	// /include/uapi/linux/stat.h.
+	//
+	// Device numbers (Dev* and Rdev*) are left as zero, so as not to expose
+	// host device numbers.
+	//
+	// TODO(gvisor.dev/issue/1672): Use kernfs-specific, internally defined
+	// device numbers. If we use the device number from the host, it may collide
+	// with another sentry-internal device number. We handle device/inode
+	// numbers without relying on the host to prevent collisions.
+	ls.Blksize = s.Blksize
+	ls.Attributes = s.Attributes
+	ls.AttributesMask = s.Attributes_mask
+
+	if mask&linux.STATX_TYPE != 0 {
+		ls.Mode |= s.Mode & linux.S_IFMT
+	}
+	if mask&linux.STATX_MODE != 0 {
+		ls.Mode |= s.Mode &^ linux.S_IFMT
+	}
+	if mask&linux.STATX_NLINK != 0 {
+		ls.Nlink = s.Nlink
+	}
+	if mask&linux.STATX_ATIME != 0 {
+		ls.Atime = unixToLinuxStatxTimestamp(s.Atime)
+	}
+	if mask&linux.STATX_BTIME != 0 {
+		ls.Btime = unixToLinuxStatxTimestamp(s.Btime)
+	}
+	if mask&linux.STATX_CTIME != 0 {
+		ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime)
+	}
+	if mask&linux.STATX_MTIME != 0 {
+		ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime)
+	}
+	if mask&linux.STATX_SIZE != 0 {
+		ls.Size = s.Size
+	}
+	if mask&linux.STATX_BLOCKS != 0 {
+		ls.Blocks = s.Blocks
+	}
+
+	// Use our own internal inode number and file owner.
+	if mask&linux.STATX_INO != 0 {
+		ls.Ino = i.ino
+	}
+	if mask&linux.STATX_UID != 0 {
+		ls.UID = uint32(i.uid)
+	}
+	if mask&linux.STATX_GID != 0 {
+		ls.GID = uint32(i.gid)
+	}
+
+	return ls, nil
+}
+
+// fstat is a best-effort fallback for inode.Stat() if the host does not
+// support statx(2).
+//
+// We ignore the mask and sync flags in opts and simply supply
+// STATX_BASIC_STATS, as fstat(2) itself does not allow the specification
+// of a mask or sync flags. fstat(2) does not provide any metadata
+// equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so
+// those fields remain empty.
+func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) {
+	var s unix.Stat_t
+	if err := unix.Fstat(i.hostFD, &s); err != nil {
 		return linux.Statx{}, err
 	}
-	ls := unixToLinuxStatx(s)
+
+	// Note that rdev numbers are left as 0; do not expose host device numbers.
+	ls := linux.Statx{
+		Mask:    linux.STATX_BASIC_STATS,
+		Blksize: uint32(s.Blksize),
+		Nlink:   uint32(s.Nlink),
+		Mode:    uint16(s.Mode),
+		Size:    uint64(s.Size),
+		Blocks:  uint64(s.Blocks),
+		Atime:   timespecToStatxTimestamp(s.Atim),
+		Ctime:   timespecToStatxTimestamp(s.Ctim),
+		Mtime:   timespecToStatxTimestamp(s.Mtim),
+	}
 
 	// Use our own internal inode number and file owner.
 	//
@@ -159,9 +265,6 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro
 	ls.UID = uint32(i.uid)
 	ls.GID = uint32(i.gid)
 
-	// Update file mode from the host.
-	i.mode = linux.FileMode(ls.Mode)
-
 	return ls, nil
 }
 
@@ -217,7 +320,6 @@ func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptio
 }
 
 func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) {
-
 	fileType := i.mode.FileType()
 	if fileType == syscall.S_IFSOCK {
 		if i.isTTY {
@@ -227,6 +329,8 @@ func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error
 		return nil, errors.New("importing host sockets not supported")
 	}
 
+	// TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that
+	// we don't allow importing arbitrary file types without proper support.
 	if i.isTTY {
 		// TODO(gvisor.dev/issue/1672): support importing host fd as TTY.
 		return nil, errors.New("importing host fd as TTY not supported")
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
index e1ccacb4d..d519feef5 100644
--- a/pkg/sentry/fsimpl/host/util.go
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -35,34 +35,14 @@ func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec {
 	}
 }
 
-func unixToLinuxStatx(s unix.Statx_t) linux.Statx {
-	return linux.Statx{
-		Mask:           s.Mask,
-		Blksize:        s.Blksize,
-		Attributes:     s.Attributes,
-		Nlink:          s.Nlink,
-		UID:            s.Uid,
-		GID:            s.Gid,
-		Mode:           s.Mode,
-		Ino:            s.Ino,
-		Size:           s.Size,
-		Blocks:         s.Blocks,
-		AttributesMask: s.Attributes_mask,
-		Atime:          unixToLinuxStatxTimestamp(s.Atime),
-		Btime:          unixToLinuxStatxTimestamp(s.Btime),
-		Ctime:          unixToLinuxStatxTimestamp(s.Ctime),
-		Mtime:          unixToLinuxStatxTimestamp(s.Mtime),
-		RdevMajor:      s.Rdev_major,
-		RdevMinor:      s.Rdev_minor,
-		DevMajor:       s.Dev_major,
-		DevMinor:       s.Dev_minor,
-	}
-}
-
 func unixToLinuxStatxTimestamp(ts unix.StatxTimestamp) linux.StatxTimestamp {
 	return linux.StatxTimestamp{Sec: ts.Sec, Nsec: ts.Nsec}
 }
 
+func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp {
+	return linux.StatxTimestamp{Sec: int64(ts.Sec), Nsec: uint32(ts.Nsec)}
+}
+
 // wouldBlock returns true for file types that can return EWOULDBLOCK
 // for blocking operations, e.g. pipes, character devices, and sockets.
 func wouldBlock(fileType uint32) bool {
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 1d627564f..6feda8fa1 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -467,6 +467,11 @@ func (k *Kernel) flushMountSourceRefs() error {
 //
 // Precondition: Must be called with the kernel paused.
 func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) {
+	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+	if VFS2Enabled {
+		return nil
+	}
+
 	ts.mu.RLock()
 	defer ts.mu.RUnlock()
 	for t := range ts.Root.tids {
@@ -484,7 +489,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error)
 }
 
 func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
-	// TODO(gvisor.dev/issues/1663): Add save support for VFS2.
+	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
 	return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
 		if flags := file.Flags(); !flags.Write {
 			return nil
@@ -533,6 +538,11 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
 }
 
 func (ts *TaskSet) unregisterEpollWaiters() {
+	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+	if VFS2Enabled {
+		return
+	}
+
 	ts.mu.RLock()
 	defer ts.mu.RUnlock()
 	for t := range ts.Root.tids {
@@ -1005,11 +1015,14 @@ func (k *Kernel) pauseTimeLocked() {
 		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
 		// but ktime.Timer.Pause is idempotent so this is harmless.
 		if t.fdTable != nil {
-			t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
-				if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
-					tfd.PauseTimer()
-				}
-			})
+			// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+			if !VFS2Enabled {
+				t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+					if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
+						tfd.PauseTimer()
+					}
+				})
+			}
 		}
 	}
 	k.timekeeper.PauseUpdates()
@@ -1034,12 +1047,15 @@ func (k *Kernel) resumeTimeLocked() {
 				it.ResumeTimer()
 			}
 		}
-		if t.fdTable != nil {
-			t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
-				if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
-					tfd.ResumeTimer()
-				}
-			})
+		// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+		if !VFS2Enabled {
+			if t.fdTable != nil {
+				t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+					if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
+						tfd.ResumeTimer()
+					}
+				})
+			}
 		}
 	}
 }
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index 9bd2df104..a11a87cd1 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -136,7 +136,10 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	mask := args[3].Uint()
 	statxAddr := args[4].Pointer()
 
-	if mask&linux.STATX__RESERVED > 0 {
+	if mask&linux.STATX__RESERVED != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	if flags&^(linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH|linux.AT_STATX_SYNC_TYPE) != 0 {
 		return 0, nil, syserror.EINVAL
 	}
 	if flags&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE {
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
index a74ea6fd5..97eaedd66 100644
--- a/pkg/sentry/syscalls/linux/vfs2/stat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -150,7 +150,11 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	mask := args[3].Uint()
 	statxAddr := args[4].Pointer()
 
-	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
+	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	if mask&linux.STATX__RESERVED != 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index a4627905e..f459d1973 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -284,6 +284,7 @@ var allowedSyscalls = seccomp.SyscallRules{
 		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
 	},
 	syscall.SYS_SIGALTSTACK:     {},
+	unix.SYS_STATX:              {},
 	syscall.SYS_SYNC_FILE_RANGE: {},
 	syscall.SYS_TGKILL: []seccomp.Rule{
 		{
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index c951ac3b3..513b9cd1c 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -607,7 +607,7 @@ int statx(int dirfd, const char* pathname, int flags, unsigned int mask,
 }
 
 TEST_F(StatTest, StatxAbsPath) {
-  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 &&
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
           errno == ENOSYS);
 
   struct kernel_statx stx;
@@ -617,7 +617,7 @@ TEST_F(StatTest, StatxAbsPath) {
 }
 
 TEST_F(StatTest, StatxRelPathDirFD) {
-  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 &&
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
           errno == ENOSYS);
 
   struct kernel_statx stx;
@@ -631,7 +631,7 @@ TEST_F(StatTest, StatxRelPathDirFD) {
 }
 
 TEST_F(StatTest, StatxRelPathCwd) {
-  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 &&
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
           errno == ENOSYS);
 
   ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds());
@@ -643,7 +643,7 @@ TEST_F(StatTest, StatxRelPathCwd) {
 }
 
 TEST_F(StatTest, StatxEmptyPath) {
-  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 &&
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
           errno == ENOSYS);
 
   const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY));
@@ -653,6 +653,58 @@ TEST_F(StatTest, StatxEmptyPath) {
   EXPECT_TRUE(S_ISREG(stx.stx_mode));
 }
 
+TEST_F(StatTest, StatxDoesNotRejectExtraneousMaskBits) {
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+          errno == ENOSYS);
+
+  struct kernel_statx stx;
+  // Set all mask bits except for STATX__RESERVED.
+  uint mask = 0xffffffff & ~0x80000000;
+  EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, mask, &stx),
+              SyscallSucceeds());
+  EXPECT_TRUE(S_ISREG(stx.stx_mode));
+}
+
+TEST_F(StatTest, StatxRejectsReservedMaskBit) {
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+          errno == ENOSYS);
+
+  struct kernel_statx stx;
+  // Set STATX__RESERVED in the mask.
+  EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, 0x80000000, &stx),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+// statx must stat the link itself only when AT_SYMLINK_NOFOLLOW is given.
+TEST_F(StatTest, StatxSymlink) {
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+          errno == ENOSYS);
+
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(GetAbsoluteTestTmpdir(), test_file_name_));
+  std::string p = link.path();
+
+  struct kernel_statx stx;
+  EXPECT_THAT(statx(AT_FDCWD, p.c_str(), AT_SYMLINK_NOFOLLOW, STATX_ALL, &stx),
+              SyscallSucceeds());
+  EXPECT_TRUE(S_ISLNK(stx.stx_mode));
+  EXPECT_THAT(statx(AT_FDCWD, p.c_str(), 0, STATX_ALL, &stx),
+              SyscallSucceeds());
+  EXPECT_TRUE(S_ISREG(stx.stx_mode));
+}
+
+TEST_F(StatTest, StatxInvalidFlags) {
+  SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+          errno == ENOSYS);
+
+  struct kernel_statx stx;
+  EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), 12345, 0, &stx),
+              SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(),
+                    0x6000 /* AT_STATX_SYNC_TYPE */, 0, &stx),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From 69da42885aff9371fd53227583a546df914de02b Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Mon, 16 Mar 2020 12:02:33 -0700
Subject: Enable ARP resolution in TAP devices.

PiperOrigin-RevId: 301208471
---
 pkg/tcpip/link/tun/device.go  |  10 +++-
 test/syscalls/linux/tuntap.cc | 105 +++++++++++++++++++++++++++++++-----------
 2 files changed, 86 insertions(+), 29 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
index 6ff47a742..f6e301304 100644
--- a/pkg/tcpip/link/tun/device.go
+++ b/pkg/tcpip/link/tun/device.go
@@ -98,7 +98,12 @@ func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
 		prefix = "tap"
 	}
 
-	endpoint, err := attachOrCreateNIC(s, name, prefix)
+	linkCaps := stack.CapabilityNone
+	if isTap {
+		linkCaps |= stack.CapabilityResolutionRequired
+	}
+
+	endpoint, err := attachOrCreateNIC(s, name, prefix, linkCaps)
 	if err != nil {
 		return syserror.EINVAL
 	}
@@ -109,7 +114,7 @@ func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
 	return nil
 }
 
-func attachOrCreateNIC(s *stack.Stack, name, prefix string) (*tunEndpoint, error) {
+func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkEndpointCapabilities) (*tunEndpoint, error) {
 	for {
 		// 1. Try to attach to an existing NIC.
 		if name != "" {
@@ -135,6 +140,7 @@ func attachOrCreateNIC(s *stack.Stack, name, prefix string) (*tunEndpoint, error
 			nicID:    id,
 			name:     name,
 		}
+		endpoint.Endpoint.LinkEPCapabilities = linkCaps
 		if endpoint.name == "" {
 			endpoint.name = fmt.Sprintf("%s%d", prefix, id)
 		}
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
index f734511d6..53ad2dda3 100644
--- a/test/syscalls/linux/tuntap.cc
+++ b/test/syscalls/linux/tuntap.cc
@@ -256,50 +256,59 @@ TEST_F(TuntapTest, WriteToDownDevice) {
   EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EIO));
 }
 
-// This test sets up a TAP device and pings kernel by sending ICMP echo request.
-//
-// It works as the following:
-// * Open /dev/net/tun, and create kTapName interface.
-// * Use rtnetlink to do initial setup of the interface:
-//   * Assign IP address 10.0.0.1/24 to kernel.
-//   * MAC address: kMacA
-//   * Bring up the interface.
-// * Send an ICMP echo reqest (ping) packet from 10.0.0.2 (kMacB) to kernel.
-// * Loop to receive packets from TAP device/fd:
-//   * If packet is an ICMP echo reply, it stops and passes the test.
-//   * If packet is an ARP request, it responds with canned reply and resends
-//   the
-//     ICMP request packet.
-TEST_F(TuntapTest, PingKernel) {
-  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
-
+PosixErrorOr<FileDescriptor> OpenAndAttachTap(
+    const std::string& dev_name, const std::string& dev_ipv4_addr) {
   // Interface creation.
-  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, Open(kDevNetTun, O_RDWR));
 
   struct ifreq ifr_set = {};
   ifr_set.ifr_flags = IFF_TAP;
-  strncpy(ifr_set.ifr_name, kTapName, IFNAMSIZ);
-  EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr_set),
-              SyscallSucceedsWithValue(0));
+  strncpy(ifr_set.ifr_name, dev_name.c_str(), IFNAMSIZ);
+  if (ioctl(fd.get(), TUNSETIFF, &ifr_set) < 0) {
+    return PosixError(errno);
+  }
 
-  absl::optional<Link> link =
-      ASSERT_NO_ERRNO_AND_VALUE(GetLinkByName(kTapName));
-  ASSERT_TRUE(link.has_value());
+  ASSIGN_OR_RETURN_ERRNO(absl::optional<Link> link, GetLinkByName(dev_name));
+  if (!link.has_value()) {
+    return PosixError(ENOENT, "no link");
+  }
 
   // Interface setup.
   struct in_addr addr;
-  inet_pton(AF_INET, "10.0.0.1", &addr);
+  inet_pton(AF_INET, dev_ipv4_addr.c_str(), &addr);
   EXPECT_NO_ERRNO(LinkAddLocalAddr(link->index, AF_INET, /*prefixlen=*/24,
                                    &addr, sizeof(addr)));
 
   if (!IsRunningOnGvisor()) {
     // FIXME: gVisor doesn't support setting MAC address on interfaces yet.
-    EXPECT_NO_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA)));
+    RETURN_IF_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA)));
 
     // FIXME: gVisor always creates enabled/up'd interfaces.
-    EXPECT_NO_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP));
+    RETURN_IF_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP));
   }
 
+  return fd;
+}
+
+// This test sets up a TAP device and pings kernel by sending ICMP echo request.
+//
+// It works as the following:
+// * Open /dev/net/tun, and create kTapName interface.
+// * Use rtnetlink to do initial setup of the interface:
+//   * Assign IP address 10.0.0.1/24 to kernel.
+//   * MAC address: kMacA
+//   * Bring up the interface.
+// * Send an ICMP echo request (ping) packet from 10.0.0.2 (kMacB) to kernel.
+// * Loop to receive packets from TAP device/fd:
+//   * If packet is an ICMP echo reply, it stops and passes the test.
+//   * If packet is an ARP request, it responds with canned reply and resends
+//   the
+//     ICMP request packet.
+TEST_F(TuntapTest, PingKernel) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, "10.0.0.1"));
   ping_pkt ping_req = CreatePingPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
   std::string arp_rep = CreateArpPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
 
@@ -349,5 +358,47 @@ TEST_F(TuntapTest, PingKernel) {
   }
 }
 
+// Verifies that sending UDP to an unresolved neighbor emits an ARP request.
+TEST_F(TuntapTest, SendUdpTriggersArpResolution) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, "10.0.0.1"));
+
+  // Send a UDP packet to remote; FileDescriptor closes the socket for us.
+  FileDescriptor sock(socket(AF_INET, SOCK_DGRAM, IPPROTO_IP));
+  ASSERT_THAT(sock.get(), SyscallSucceeds());
+
+  struct sockaddr_in remote = {};
+  remote.sin_family = AF_INET;
+  remote.sin_port = htons(42);
+  inet_pton(AF_INET, "10.0.0.2", &remote.sin_addr);
+  int ret = sendto(sock.get(), "hello", 5, 0,
+                   reinterpret_cast<sockaddr*>(&remote), sizeof(remote));
+  ASSERT_THAT(ret, ::testing::AnyOf(SyscallSucceeds(),
+                                    SyscallFailsWithErrno(EHOSTDOWN)));
+
+  struct inpkt {
+    union {
+      pihdr pi;
+      arp_pkt arp;
+    };
+  };
+  while (1) {
+    inpkt r = {};
+    int n = read(fd.get(), &r, sizeof(r));
+    ASSERT_THAT(n, SyscallSucceeds());  // EXPECT would loop forever on error.
+    const size_t len = static_cast<size_t>(n);  // n >= 0 past the ASSERT.
+    if (len < sizeof(pihdr)) {
+      std::cerr << "Ignored packet, protocol: " << r.pi.pi_protocol
+                << " len: " << n << std::endl;
+      continue;
+    }
+    if (len >= sizeof(arp_pkt) && r.pi.pi_protocol == htons(ETH_P_ARP)) {
+      break;
+    }
+  }
+}
+
 }  // namespace testing
 }  // namespace gvisor
-- 
cgit v1.2.3


From 9c35d7eb1f96f12207f78b94722f0e8b778b5af3 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Wed, 11 Mar 2020 09:55:07 +0000
Subject: Enable syscall sysret_test on arm64.

Fixes #2058

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I05750d238a6425d3a47fae15720901f4dd924a32
---
 test/syscalls/linux/BUILD     |  5 +----
 test/syscalls/linux/sysret.cc | 32 +++++++++++++++++++++++++++++---
 2 files changed, 30 insertions(+), 7 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 636e5db12..d0c431234 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3336,10 +3336,7 @@ cc_binary(
 cc_binary(
     name = "sysret_test",
     testonly = 1,
-    srcs = select_arch(
-        amd64 = ["sysret.cc"],
-        arm64 = [],
-    ),
+    srcs = ["sysret.cc"],
     linkstatic = 1,
     deps = [
         gtest,
diff --git a/test/syscalls/linux/sysret.cc b/test/syscalls/linux/sysret.cc
index 819fa655a..569190a59 100644
--- a/test/syscalls/linux/sysret.cc
+++ b/test/syscalls/linux/sysret.cc
@@ -14,6 +14,8 @@
 
 // Tests to verify that the behavior of linux and gvisor matches when
 // 'sysret' returns to bad (aka non-canonical) %rip or %rsp.
+
+#include <linux/elf.h>
 #include <sys/ptrace.h>
 #include <sys/user.h>
 
@@ -32,6 +34,7 @@ constexpr uint64_t kNonCanonicalRsp = 0xFFFF000000000000;
 class SysretTest : public ::testing::Test {
  protected:
   struct user_regs_struct regs_;
+  struct iovec iov;
   pid_t child_;
 
   void SetUp() override {
@@ -48,10 +51,14 @@ class SysretTest : public ::testing::Test {
 
     // Parent.
     int status;
+    memset(&iov, 0, sizeof(iov));
     ASSERT_THAT(pid, SyscallSucceeds());  // Might still be < 0.
     ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
     EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
-    ASSERT_THAT(ptrace(PTRACE_GETREGS, pid, 0, &regs_), SyscallSucceeds());
+
+    iov.iov_base = &regs_;
+    iov.iov_len = sizeof(regs_);
+    ASSERT_THAT(ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov), SyscallSucceeds());
 
     child_ = pid;
   }
@@ -61,13 +68,25 @@ class SysretTest : public ::testing::Test {
   }
 
   void SetRip(uint64_t newrip) {
+#if defined(__x86_64__)
     regs_.rip = newrip;
-    ASSERT_THAT(ptrace(PTRACE_SETREGS, child_, 0, &regs_), SyscallSucceeds());
+#elif defined(__aarch64__)
+    regs_.pc = newrip;
+#else
+#error "Unknown architecture"
+#endif
+    ASSERT_THAT(ptrace(PTRACE_SETREGSET, child_, NT_PRSTATUS, &iov), SyscallSucceeds());
   }
 
   void SetRsp(uint64_t newrsp) {
+#if defined(__x86_64__)
     regs_.rsp = newrsp;
-    ASSERT_THAT(ptrace(PTRACE_SETREGS, child_, 0, &regs_), SyscallSucceeds());
+#elif defined(__aarch64__)
+    regs_.sp = newrsp;
+#else
+#error "Unknown architecture"
+#endif
+    ASSERT_THAT(ptrace(PTRACE_SETREGSET, child_, NT_PRSTATUS, &iov), SyscallSucceeds());
   }
 
   // Wait waits for the child pid and returns the exit status.
@@ -104,8 +123,15 @@ TEST_F(SysretTest, BadRsp) {
   SetRsp(kNonCanonicalRsp);
   Detach();
   int status = Wait();
+#if defined(__x86_64__)
   EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGBUS)
       << "status = " << status;
+#elif defined(__aarch64__)
+  EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV)
+      << "status = " << status;
+#else
+#error "Unknown architecture"
+#endif
 }
 }  // namespace
 
-- 
cgit v1.2.3


From a730d74b3230fb32181b9a940c07b61338222874 Mon Sep 17 00:00:00 2001
From: Ian Lewis <ianlewis@google.com>
Date: Mon, 23 Mar 2020 16:11:37 -0700
Subject: Support basic /proc/net/dev metrics for netstack

Fixes #506

PiperOrigin-RevId: 302540404
---
 pkg/sentry/socket/netstack/stack.go | 73 ++++++++++++++++++++++++++-----------
 test/syscalls/linux/proc_net.cc     | 53 +++++++++++++++++++++++++++
 2 files changed, 105 insertions(+), 21 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index 0692482e9..a8e2e8c24 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -200,36 +200,66 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
 // Statistics implements inet.Stack.Statistics.
 func (s *Stack) Statistics(stat interface{}, arg string) error {
 	switch stats := stat.(type) {
+	case *inet.StatDev:
+		for _, ni := range s.Stack.NICInfo() {
+			if ni.Name != arg {
+				continue
+			}
+			// TODO(gvisor.dev/issue/2103) Support stubbed stats.
+			*stats = inet.StatDev{
+				// Receive section.
+				ni.Stats.Rx.Bytes.Value(),   // bytes.
+				ni.Stats.Rx.Packets.Value(), // packets.
+				0,                           // errs.
+				0,                           // drop.
+				0,                           // fifo.
+				0,                           // frame.
+				0,                           // compressed.
+				0,                           // multicast.
+				// Transmit section.
+				ni.Stats.Tx.Bytes.Value(),   // bytes.
+				ni.Stats.Tx.Packets.Value(), // packets.
+				0,                           // errs.
+				0,                           // drop.
+				0,                           // fifo.
+				0,                           // colls.
+				0,                           // carrier.
+				0,                           // compressed.
+			}
+			break
+		}
 	case *inet.StatSNMPIP:
 		ip := Metrics.IP
+		// TODO(gvisor.dev/issue/969) Support stubbed stats.
 		*stats = inet.StatSNMPIP{
-			0,                          // TODO(gvisor.dev/issue/969): Support Ip/Forwarding.
-			0,                          // TODO(gvisor.dev/issue/969): Support Ip/DefaultTTL.
+			0,                          // Ip/Forwarding.
+			0,                          // Ip/DefaultTTL.
 			ip.PacketsReceived.Value(), // InReceives.
-			0,                          // TODO(gvisor.dev/issue/969): Support Ip/InHdrErrors.
+			0,                          // Ip/InHdrErrors.
 			ip.InvalidDestinationAddressesReceived.Value(), // InAddrErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ForwDatagrams.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/InUnknownProtos.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/InDiscards.
+			0,                               // Ip/ForwDatagrams.
+			0,                               // Ip/InUnknownProtos.
+			0,                               // Ip/InDiscards.
 			ip.PacketsDelivered.Value(),     // InDelivers.
 			ip.PacketsSent.Value(),          // OutRequests.
 			ip.OutgoingPacketErrors.Value(), // OutDiscards.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/OutNoRoutes.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmTimeout.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmReqds.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmOKs.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/ReasmFails.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragOKs.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragFails.
-			0,                               // TODO(gvisor.dev/issue/969): Support Ip/FragCreates.
+			0,                               // Ip/OutNoRoutes.
+			0,                               // Ip/ReasmTimeout.
+			0,                               // Ip/ReasmReqds.
+			0,                               // Ip/ReasmOKs.
+			0,                               // Ip/ReasmFails.
+			0,                               // Ip/FragOKs.
+			0,                               // Ip/FragFails.
+			0,                               // Ip/FragCreates.
 		}
 	case *inet.StatSNMPICMP:
 		in := Metrics.ICMP.V4PacketsReceived.ICMPv4PacketStats
 		out := Metrics.ICMP.V4PacketsSent.ICMPv4PacketStats
+		// TODO(gvisor.dev/issue/969) Support stubbed stats.
 		*stats = inet.StatSNMPICMP{
-			0, // TODO(gvisor.dev/issue/969): Support Icmp/InMsgs.
+			0, // Icmp/InMsgs.
 			Metrics.ICMP.V4PacketsSent.Dropped.Value(), // InErrors.
-			0,                         // TODO(gvisor.dev/issue/969): Support Icmp/InCsumErrors.
+			0,                         // Icmp/InCsumErrors.
 			in.DstUnreachable.Value(), // InDestUnreachs.
 			in.TimeExceeded.Value(),   // InTimeExcds.
 			in.ParamProblem.Value(),   // InParmProbs.
@@ -241,7 +271,7 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 			in.TimestampReply.Value(), // InTimestampReps.
 			in.InfoRequest.Value(),    // InAddrMasks.
 			in.InfoReply.Value(),      // InAddrMaskReps.
-			0,                         // TODO(gvisor.dev/issue/969): Support Icmp/OutMsgs.
+			0,                         // Icmp/OutMsgs.
 			Metrics.ICMP.V4PacketsReceived.Invalid.Value(), // OutErrors.
 			out.DstUnreachable.Value(),                     // OutDestUnreachs.
 			out.TimeExceeded.Value(),                       // OutTimeExcds.
@@ -277,15 +307,16 @@ func (s *Stack) Statistics(stat interface{}, arg string) error {
 		}
 	case *inet.StatSNMPUDP:
 		udp := Metrics.UDP
+		// TODO(gvisor.dev/issue/969) Support stubbed stats.
 		*stats = inet.StatSNMPUDP{
 			udp.PacketsReceived.Value(),     // InDatagrams.
 			udp.UnknownPortErrors.Value(),   // NoPorts.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/InErrors.
+			0,                               // Udp/InErrors.
 			udp.PacketsSent.Value(),         // OutDatagrams.
 			udp.ReceiveBufferErrors.Value(), // RcvbufErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/SndbufErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/InCsumErrors.
-			0,                               // TODO(gvisor.dev/issue/969): Support Udp/IgnoredMulti.
+			0,                               // Udp/SndbufErrors.
+			0,                               // Udp/InCsumErrors.
+			0,                               // Udp/IgnoredMulti.
 		}
 	default:
 		return syserr.ErrEndpointOperation.ToError()
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index 05c952b99..4e23d1e78 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -92,6 +92,59 @@ TEST(ProcSysNetIpv4Sack, CanReadAndWrite) {
   EXPECT_EQ(buf, to_write);
 }
 
+// DeviceEntry is an entry in /proc/net/dev
+struct DeviceEntry {
+  std::string name;
+  uint64_t stats[16];
+};
+
+// Parses the contents of /proc/net/dev (passed in as `dev`) into one
+// DeviceEntry per interface line. Returns EINVAL if a line is malformed.
+PosixErrorOr<std::vector<DeviceEntry>> GetDeviceMetricsFromProc(
+    const std::string& dev) {
+  // Number of counters per line, derived from the DeviceEntry::stats array.
+  constexpr size_t kNumStats = sizeof(DeviceEntry::stats) / sizeof(uint64_t);
+  const std::vector<std::string> lines = absl::StrSplit(dev, '\n');
+  std::vector<DeviceEntry> entries;
+
+  // /proc/net/dev prints 2 lines of headers followed by a line of metrics for
+  // each network interface.
+  for (size_t i = 2; i < lines.size(); i++) {
+    // Ignore empty lines.
+    if (lines[i].empty()) {
+      continue;
+    }
+
+    const std::vector<std::string> values =
+        absl::StrSplit(lines[i], ' ', absl::SkipWhitespace());
+    if (values.size() != kNumStats + 1) {
+      return PosixError(EINVAL, "invalid line: " + lines[i]);
+    }
+
+    DeviceEntry entry;
+    entry.name = values[0];
+    // Skip the interface name and read only the values.
+    for (size_t j = 0; j < kNumStats; j++) {
+      if (!absl::SimpleAtoi(values[j + 1], &entry.stats[j])) {
+        return PosixError(EINVAL, "invalid value: " + values[j + 1]);
+      }
+    }
+
+    entries.push_back(entry);
+  }
+
+  return entries;
+}
+
+// TEST(ProcNetDev, Format) tests that /proc/net/dev is parsable and
+// contains at least one entry.
+TEST(ProcNetDev, Format) {
+  auto dev = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/dev"));
+  auto entries = ASSERT_NO_ERRNO_AND_VALUE(GetDeviceMetricsFromProc(dev));
+
+  EXPECT_GT(entries.size(), 0);
+}
+
 PosixErrorOr<uint64_t> GetSNMPMetricFromProc(const std::string snmp,
                                              const std::string& type,
                                              const std::string& item) {
-- 
cgit v1.2.3


From f97858011fa88b539585ca456943922204d92840 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 24 Mar 2020 10:57:24 -0700
Subject: Open a temp directory before changing capabilities and user ID-s

In cl/302130790, we started using a temp directory which is provided by bazel.

By default, a test process has enough permissions to open it, but there is no
guarantee that it will still be able to do so after changing credentials.

PiperOrigin-RevId: 302702337
---
 test/syscalls/linux/sticky.cc | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/sticky.cc b/test/syscalls/linux/sticky.cc
index 7e73325bf..92eec0449 100644
--- a/test/syscalls/linux/sticky.cc
+++ b/test/syscalls/linux/sticky.cc
@@ -42,8 +42,9 @@ TEST(StickyTest, StickyBitPermDenied) {
 
   auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds());
-  std::string path = JoinPath(dir.path(), "NewDir");
-  ASSERT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds());
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_DIRECTORY));
+  ASSERT_THAT(mkdirat(dirfd.get(), "NewDir", 0755), SyscallSucceeds());
 
   // Drop privileges and change IDs only in child thread, or else this parent
   // thread won't be able to open some log files after the test ends.
@@ -61,7 +62,8 @@ TEST(StickyTest, StickyBitPermDenied) {
         syscall(SYS_setresuid, -1, absl::GetFlag(FLAGS_scratch_uid), -1),
         SyscallSucceeds());
 
-    EXPECT_THAT(rmdir(path.c_str()), SyscallFailsWithErrno(EPERM));
+    EXPECT_THAT(unlinkat(dirfd.get(), "NewDir", AT_REMOVEDIR),
+                SyscallFailsWithErrno(EPERM));
   });
 }
 
@@ -96,8 +98,9 @@ TEST(StickyTest, StickyBitCapFOWNER) {
 
   auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds());
-  std::string path = JoinPath(dir.path(), "NewDir");
-  ASSERT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds());
+  const FileDescriptor dirfd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_DIRECTORY));
+  ASSERT_THAT(mkdirat(dirfd.get(), "NewDir", 0755), SyscallSucceeds());
 
   // Drop privileges and change IDs only in child thread, or else this parent
   // thread won't be able to open some log files after the test ends.
@@ -114,7 +117,8 @@ TEST(StickyTest, StickyBitCapFOWNER) {
         SyscallSucceeds());
 
     EXPECT_NO_ERRNO(SetCapability(CAP_FOWNER, true));
-    EXPECT_THAT(rmdir(path.c_str()), SyscallSucceeds());
+    EXPECT_THAT(unlinkat(dirfd.get(), "NewDir", AT_REMOVEDIR),
+                SyscallSucceeds());
   });
 }
 }  // namespace
-- 
cgit v1.2.3


From e541ebec2fdb5b29209cb3fc8235b77edcaebb6a Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 25 Mar 2020 14:54:10 -0700
Subject: Misc fixes to make stat_test pass (almost)

The only test failing now requires socket which is not
available in VFS2 yet.

Updates #1198

PiperOrigin-RevId: 302976572
---
 pkg/bits/bits_template.go                    |  8 ++++++
 pkg/bits/uint64_test.go                      | 18 ++++++++++++
 pkg/sentry/fsimpl/gofer/filesystem.go        | 16 +++++++++--
 pkg/sentry/fsimpl/gofer/gofer.go             | 41 ++++++++++++++++++++++++----
 pkg/sentry/syscalls/linux/vfs2/BUILD         |  1 +
 pkg/sentry/syscalls/linux/vfs2/filesystem.go |  2 +-
 pkg/sentry/syscalls/linux/vfs2/getdents.go   |  4 +--
 pkg/sentry/syscalls/linux/vfs2/stat.go       |  7 ++++-
 pkg/sentry/vfs/resolving_path.go             | 16 +++++++++--
 test/syscalls/linux/stat.cc                  | 11 +++++++-
 10 files changed, 109 insertions(+), 15 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/bits/bits_template.go b/pkg/bits/bits_template.go
index 93a435b80..998645388 100644
--- a/pkg/bits/bits_template.go
+++ b/pkg/bits/bits_template.go
@@ -42,3 +42,11 @@ func Mask(is ...int) T {
 func MaskOf(i int) T {
 	return T(1) << T(i)
 }
+
+// IsPowerOfTwo reports whether v is a power of two, i.e. has exactly one bit
+// set. Zero is not a power of two.
+func IsPowerOfTwo(v T) bool {
+	// v&(v-1) clears the lowest set bit, so it is zero only when v had at
+	// most one bit set; zero itself is excluded explicitly.
+	return v != 0 && v&(v-1) == 0
+}
diff --git a/pkg/bits/uint64_test.go b/pkg/bits/uint64_test.go
index 1b018d808..193d1ebcd 100644
--- a/pkg/bits/uint64_test.go
+++ b/pkg/bits/uint64_test.go
@@ -114,3 +114,21 @@ func TestIsOn(t *testing.T) {
 		}
 	}
 }
+
+func TestIsPowerOfTwo(t *testing.T) {
+	for _, tc := range []struct {
+		v    uint64
+		want bool
+	}{
+		{v: 0, want: false},
+		{v: 1, want: true},
+		{v: 2, want: true},
+		{v: 3, want: false},
+		{v: 4, want: true},
+		{v: 5, want: false},
+	} {
+		if got := IsPowerOfTwo64(tc.v); got != tc.want {
+			t.Errorf("IsPowerOfTwo(%d) = %t, want: %t", tc.v, got, tc.want)
+		}
+	}
+}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 38e4cdbc5..26b492185 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -454,6 +454,9 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
 	}
 	if fs.opts.interop != InteropModeShared {
 		parent.touchCMtime(ctx)
+		if dir {
+			parent.decLinks()
+		}
 		parent.cacheNegativeChildLocked(name)
 		parent.dirents = nil
 	}
@@ -569,8 +572,13 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
 	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
 		creds := rp.Credentials()
-		_, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
-		return err
+		if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
+			return err
+		}
+		if fs.opts.interop != InteropModeShared {
+			parent.incLinks()
+		}
+		return nil
 	})
 }
 
@@ -962,6 +970,10 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		oldParent.dirents = nil
 		delete(newParent.negativeChildren, newName)
 		newParent.dirents = nil
+		if renamed.isDir() {
+			oldParent.decLinks()
+			newParent.incLinks()
+		}
 	}
 	vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD)
 	return nil
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 999485492..13928ce36 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -485,6 +485,11 @@ type dentry struct {
 	// locked to mutate it).
 	size uint64
 
+	// nlink counts the number of hard links to this dentry. It's updated and
+	// accessed using atomic operations. It's not protected by metadataMu like the
+	// other metadata fields.
+	nlink uint32
+
 	mapsMu sync.Mutex
 
 	// If this dentry represents a regular file, mappings tracks mappings of
@@ -604,6 +609,9 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 	if mask.BTime {
 		d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)
 	}
+	if mask.NLink {
+		d.nlink = uint32(attr.NLink)
+	}
 	d.vfsd.Init(d)
 
 	fs.syncMu.Lock()
@@ -645,6 +653,9 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
 	if mask.BTime {
 		atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds))
 	}
+	if mask.NLink {
+		atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
+	}
 	if mask.Size {
 		d.dataMu.Lock()
 		atomic.StoreUint64(&d.size, attr.Size)
@@ -687,10 +698,7 @@ func (d *dentry) fileType() uint32 {
 func (d *dentry) statTo(stat *linux.Statx) {
 	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
 	stat.Blksize = atomic.LoadUint32(&d.blockSize)
-	stat.Nlink = 1
-	if d.isDir() {
-		stat.Nlink = 2
-	}
+	stat.Nlink = atomic.LoadUint32(&d.nlink)
 	stat.UID = atomic.LoadUint32(&d.uid)
 	stat.GID = atomic.LoadUint32(&d.gid)
 	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
@@ -703,7 +711,7 @@ func (d *dentry) statTo(stat *linux.Statx) {
 	stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime))
 	stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime))
 	stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime))
-	// TODO(jamieliu): device number
+	// TODO(gvisor.dev/issue/1198): device number
 }
 
 func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error {
@@ -1094,6 +1102,26 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 	return nil
 }
 
+// incLinks increments link count.
+//
+// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32.
+func (d *dentry) incLinks() {
+	v := atomic.AddUint32(&d.nlink, 1)
+	if v < 2 {
+		panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v))
+	}
+}
+
+// decLinks decrements link count.
+//
+// Preconditions: d.nlink > 1.
+func (d *dentry) decLinks() {
+	v := atomic.AddUint32(&d.nlink, ^uint32(0))
+	if v == 0 {
+		panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v))
+	}
+}
+
 // fileDescription is embedded by gofer implementations of
 // vfs.FileDescriptionImpl.
 type fileDescription struct {
@@ -1112,7 +1140,8 @@ func (fd *fileDescription) dentry() *dentry {
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	d := fd.dentry()
-	if d.fs.opts.interop == InteropModeShared && opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
+	const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME)
+	if d.fs.opts.interop == InteropModeShared && opts.Mask&(validMask) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
 		// TODO(jamieliu): Use specialFileFD.handle.file for the getattr if
 		// available?
 		if err := d.updateFromGetattr(ctx); err != nil {
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index e7695e995..2eb210014 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -31,6 +31,7 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/bits",
         "//pkg/fspath",
         "//pkg/gohacks",
         "//pkg/sentry/arch",
diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
index fc5ceea4c..a859095e2 100644
--- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go
+++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go
@@ -250,7 +250,7 @@ func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
 	if err != nil {
 		return err
 	}
-	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink)
+	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go
index ddc140b65..a61cc5059 100644
--- a/pkg/sentry/syscalls/linux/vfs2/getdents.go
+++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go
@@ -97,7 +97,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
 		//     char           d_name[]; /* Filename (null-terminated) */
 		// };
 		size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
-		if size < cb.remaining {
+		if size > cb.remaining {
 			return syserror.EINVAL
 		}
 		buf = cb.t.CopyScratchBuffer(size)
@@ -125,7 +125,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
 			panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width()))
 		}
 		size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name)
-		if size < cb.remaining {
+		if size > cb.remaining {
 			return syserror.EINVAL
 		}
 		buf = cb.t.CopyScratchBuffer(size)
diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go
index 068243132..fdfe49243 100644
--- a/pkg/sentry/syscalls/linux/vfs2/stat.go
+++ b/pkg/sentry/syscalls/linux/vfs2/stat.go
@@ -16,6 +16,7 @@ package vfs2
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/gohacks"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
@@ -153,7 +154,11 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 {
 		return 0, nil, syserror.EINVAL
 	}
-
+	// Make sure that only one sync type option is set.
+	syncType := uint32(flags & linux.AT_STATX_SYNC_TYPE)
+	if syncType != 0 && !bits.IsPowerOfTwo32(syncType) {
+		return 0, nil, syserror.EINVAL
+	}
 	if mask&linux.STATX__RESERVED != 0 {
 		return 0, nil, syserror.EINVAL
 	}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index eb4ebb511..8f31495da 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -329,10 +329,22 @@ func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) {
 // component in pcs represents a symbolic link, the symbolic link should be
 // followed.
 //
+// If path is terminated with '/', the '/' is considered the last element and
+// any symlink before that is followed:
+//   - For most non-creating walks, the last path component is handled by
+//     fs/namei.c:lookup_last(), which sets LOOKUP_FOLLOW if the first byte
+//     after the path component is non-NULL (which is only possible if it's '/')
+//     and the path component is of type LAST_NORM.
+//
+//   - For open/openat/openat2 without O_CREAT, the last path component is
+//     handled by fs/namei.c:do_last(), which does the same, though without the
+//     LAST_NORM check.
+//
 // Preconditions: !rp.Done().
 func (rp *ResolvingPath) ShouldFollowSymlink() bool {
-	// Non-final symlinks are always followed.
-	return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final()
+	// Non-final symlinks are always followed. Paths terminated with '/' are also
+	// always followed.
+	return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() || rp.MustBeDir()
 }
 
 // HandleSymlink is called when the current path component is a symbolic link
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
index 513b9cd1c..2503960f3 100644
--- a/test/syscalls/linux/stat.cc
+++ b/test/syscalls/linux/stat.cc
@@ -34,6 +34,13 @@
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 
+#ifndef AT_STATX_FORCE_SYNC
+#define AT_STATX_FORCE_SYNC 0x2000
+#endif
+#ifndef AT_STATX_DONT_SYNC
+#define AT_STATX_DONT_SYNC 0x4000
+#endif
+
 namespace gvisor {
 namespace testing {
 
@@ -700,8 +707,10 @@ TEST_F(StatTest, StatxInvalidFlags) {
   struct kernel_statx stx;
   EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), 12345, 0, &stx),
               SyscallFailsWithErrno(EINVAL));
+
+  // Sync flags are mutually exclusive.
   EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(),
-                    0x6000 /* AT_STATX_SYNC_TYPE */, 0, &stx),
+                    AT_STATX_FORCE_SYNC | AT_STATX_DONT_SYNC, 0, &stx),
               SyscallFailsWithErrno(EINVAL));
 }
 
-- 
cgit v1.2.3


From 8ce5b569714351f9f2f7fc48b0ff0bebbdb018ee Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 20 Mar 2020 08:45:07 +0000
Subject: Cleanup for syscall tests on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I8008c0375fc7e23225a21026f359e78e691729e5
---
 test/syscalls/linux/getrandom.cc |  2 ++
 test/syscalls/linux/lseek.cc     |  2 +-
 test/syscalls/linux/mlock.cc     |  4 +++-
 test/syscalls/linux/mmap.cc      | 10 ++++++++--
 4 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/getrandom.cc b/test/syscalls/linux/getrandom.cc
index f97f60029..f87cdd7a1 100644
--- a/test/syscalls/linux/getrandom.cc
+++ b/test/syscalls/linux/getrandom.cc
@@ -29,6 +29,8 @@ namespace {
 #define SYS_getrandom 318
 #elif defined(__i386__)
 #define SYS_getrandom 355
+#elif defined(__aarch64__)
+#define SYS_getrandom 278
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/syscalls/linux/lseek.cc b/test/syscalls/linux/lseek.cc
index a8af8e545..6ce1e6cc3 100644
--- a/test/syscalls/linux/lseek.cc
+++ b/test/syscalls/linux/lseek.cc
@@ -53,7 +53,7 @@ TEST(LseekTest, NegativeOffset) {
 // A 32-bit off_t is not large enough to represent an offset larger than
 // maximum file size on standard file systems, so it isn't possible to cause
 // overflow.
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__aarch64__)
 TEST(LseekTest, Overflow) {
   // HA! Classic Linux. We really should have an EOVERFLOW
   // here, since we're seeking to something that cannot be
diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc
index 367a90fe1..78ac96bed 100644
--- a/test/syscalls/linux/mlock.cc
+++ b/test/syscalls/linux/mlock.cc
@@ -199,8 +199,10 @@ TEST(MunlockallTest, Basic) {
 }
 
 #ifndef SYS_mlock2
-#ifdef __x86_64__
+#if defined(__x86_64__)
 #define SYS_mlock2 325
+#elif defined(__aarch64__)
+#define SYS_mlock2 284
 #endif
 #endif
 
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index 11fb1b457..6d3227ab6 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -361,7 +361,7 @@ TEST_F(MMapTest, MapFixed) {
 }
 
 // 64-bit addresses work too
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__aarch64__)
 TEST_F(MMapTest, MapFixed64) {
   EXPECT_THAT(Map(0x300000000000, kPageSize, PROT_NONE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0),
@@ -571,6 +571,12 @@ const uint8_t machine_code[] = {
     0xb8, 0x2a, 0x00, 0x00, 0x00,  // movl $42, %eax
     0xc3,                          // retq
 };
+#elif defined(__aarch64__)
+const uint8_t machine_code[] = {
+    0x40, 0x05, 0x80, 0x52,  // mov w0, #42
+    0xc0, 0x03, 0x5f, 0xd6,  // ret
+};
+#endif
 
 // PROT_EXEC allows code execution
 TEST_F(MMapTest, ProtExec) {
@@ -605,7 +611,6 @@ TEST_F(MMapTest, NoProtExecDeath) {
 
   EXPECT_EXIT(func(), ::testing::KilledBySignal(SIGSEGV), "");
 }
-#endif
 
 TEST_F(MMapTest, NoExceedLimitData) {
   void* prevbrk;
@@ -1644,6 +1649,7 @@ TEST(MMapNoFixtureTest, MapReadOnlyAfterCreateWriteOnly) {
 }
 
 // Conditional on MAP_32BIT.
+// This flag is supported only on x86-64, for 64-bit programs.
 #ifdef __x86_64__
 
 TEST(MMapNoFixtureTest, Map32Bit) {
-- 
cgit v1.2.3


From c71e97784cfc57a0664a07cb798aca3d39d6bb11 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Fri, 28 Feb 2020 09:14:57 +0000
Subject: Enable rseq syscall test on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: If30154a2d73e98f211cfe589853b232019b9e130
---
 test/syscalls/linux/rseq/BUILD            | 41 ++++++++++---------
 test/syscalls/linux/rseq/critical.S       | 66 -------------------------------
 test/syscalls/linux/rseq/critical_amd64.S | 66 +++++++++++++++++++++++++++++++
 test/syscalls/linux/rseq/critical_arm64.S | 66 +++++++++++++++++++++++++++++++
 test/syscalls/linux/rseq/start.S          | 45 ---------------------
 test/syscalls/linux/rseq/start_amd64.S    | 45 +++++++++++++++++++++
 test/syscalls/linux/rseq/start_arm64.S    | 45 +++++++++++++++++++++
 test/syscalls/linux/rseq/syscalls.h       |  5 ++-
 test/syscalls/linux/rseq/uapi.h           |  4 +-
 9 files changed, 251 insertions(+), 132 deletions(-)
 delete mode 100644 test/syscalls/linux/rseq/critical.S
 create mode 100644 test/syscalls/linux/rseq/critical_amd64.S
 create mode 100644 test/syscalls/linux/rseq/critical_arm64.S
 delete mode 100644 test/syscalls/linux/rseq/start.S
 create mode 100644 test/syscalls/linux/rseq/start_amd64.S
 create mode 100644 test/syscalls/linux/rseq/start_arm64.S

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/rseq/BUILD b/test/syscalls/linux/rseq/BUILD
index ed488dbc2..ee5b0a11b 100644
--- a/test/syscalls/linux/rseq/BUILD
+++ b/test/syscalls/linux/rseq/BUILD
@@ -1,7 +1,7 @@
 # This package contains a standalone rseq test binary. This binary must not
 # depend on libc, which might use rseq itself.
 
-load("//tools:defs.bzl", "cc_flags_supplier", "cc_library", "cc_toolchain")
+load("//tools:defs.bzl", "cc_flags_supplier", "cc_library", "cc_toolchain", "select_arch")
 
 package(licenses = ["notice"])
 
@@ -9,32 +9,35 @@ genrule(
     name = "rseq_binary",
     srcs = [
         "critical.h",
-        "critical.S",
+        "critical_amd64.S",
+        "critical_arm64.S",
         "rseq.cc",
         "syscalls.h",
-        "start.S",
+        "start_amd64.S",
+        "start_arm64.S",
         "test.h",
         "types.h",
         "uapi.h",
     ],
     outs = ["rseq"],
-    cmd = " ".join([
-        "$(CC)",
-        "$(CC_FLAGS) ",
-        "-I.",
-        "-Wall",
-        "-Werror",
-        "-O2",
-        "-std=c++17",
-        "-static",
-        "-nostdlib",
-        "-ffreestanding",
-        "-o",
-        "$(location rseq)",
-        "$(location critical.S)",
+    cmd = "$(CC) " +
+        "$(CC_FLAGS) " +
+        "-I. " +
+        "-Wall " +
+        "-Werror " +
+        "-O2 " +
+        "-std=c++17 " +
+        "-static " +
+        "-nostdlib " +
+        "-ffreestanding " +
+        "-o " +
+        "$(location rseq) " +
+        select_arch(
+            amd64 = "$(location critical_amd64.S) $(location start_amd64.S) ",
+            arm64 = "$(location critical_arm64.S) $(location start_arm64.S) ",
+            no_match_error = "unsupported architecture",
+        ) +
         "$(location rseq.cc)",
-        "$(location start.S)",
-    ]),
     toolchains = [
         cc_toolchain,
         ":no_pie_cc_flags",
diff --git a/test/syscalls/linux/rseq/critical.S b/test/syscalls/linux/rseq/critical.S
deleted file mode 100644
index 8c0687e6d..000000000
--- a/test/syscalls/linux/rseq/critical.S
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Restartable sequences critical sections.
-
-// Loops continuously until aborted.
-//
-// void rseq_loop(struct rseq* r, struct rseq_cs* cs)
-
-  .text
-  .globl  rseq_loop
-  .type   rseq_loop, @function
-
-rseq_loop:
-  jmp begin
-
-  // Abort block before the critical section.
-  // Abort signature is 4 nops for simplicity.
-  .byte 0x90, 0x90, 0x90, 0x90
-  .globl  rseq_loop_early_abort
-rseq_loop_early_abort:
-  ret
-
-begin:
-  // r->rseq_cs = cs
-  movq %rsi, 8(%rdi)
-
-  // N.B. rseq_cs will be cleared by any preempt, even outside the critical
-  // section. Thus it must be set in or immediately before the critical section
-  // to ensure it is not cleared before the section begins.
-  .globl  rseq_loop_start
-rseq_loop_start:
-  jmp rseq_loop_start
-
-  // "Pre-commit": extra instructions inside the critical section.  These are
-  // used as the abort point in TestAbortPreCommit, which is not valid.
-  .globl  rseq_loop_pre_commit
-rseq_loop_pre_commit:
-  // Extra abort signature + nop for TestAbortPostCommit.
-  .byte 0x90, 0x90, 0x90, 0x90
-  nop
-
-  // "Post-commit": never reached in this case.
-  .globl  rseq_loop_post_commit
-rseq_loop_post_commit:
-
-  // Abort signature is 4 nops for simplicity.
-  .byte 0x90, 0x90, 0x90, 0x90
-
-  .globl  rseq_loop_abort
-rseq_loop_abort:
-  ret
-
-  .size  rseq_loop,.-rseq_loop
-  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/critical_amd64.S b/test/syscalls/linux/rseq/critical_amd64.S
new file mode 100644
index 000000000..8c0687e6d
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical_amd64.S
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Restartable sequences critical sections.
+
+// Loops continuously until aborted.
+//
+// void rseq_loop(struct rseq* r, struct rseq_cs* cs)
+
+  .text
+  .globl  rseq_loop
+  .type   rseq_loop, @function
+
+rseq_loop:
+  jmp begin
+
+  // Abort block before the critical section.
+  // Abort signature is 4 nops for simplicity.
+  .byte 0x90, 0x90, 0x90, 0x90
+  .globl  rseq_loop_early_abort
+rseq_loop_early_abort:
+  ret
+
+begin:
+  // r->rseq_cs = cs
+  movq %rsi, 8(%rdi)
+
+  // N.B. rseq_cs will be cleared by any preempt, even outside the critical
+  // section. Thus it must be set in or immediately before the critical section
+  // to ensure it is not cleared before the section begins.
+  .globl  rseq_loop_start
+rseq_loop_start:
+  jmp rseq_loop_start
+
+  // "Pre-commit": extra instructions inside the critical section.  These are
+  // used as the abort point in TestAbortPreCommit, which is not valid.
+  .globl  rseq_loop_pre_commit
+rseq_loop_pre_commit:
+  // Extra abort signature + nop for TestAbortPostCommit.
+  .byte 0x90, 0x90, 0x90, 0x90
+  nop
+
+  // "Post-commit": never reached in this case.
+  .globl  rseq_loop_post_commit
+rseq_loop_post_commit:
+
+  // Abort signature is 4 nops for simplicity.
+  .byte 0x90, 0x90, 0x90, 0x90
+
+  .globl  rseq_loop_abort
+rseq_loop_abort:
+  ret
+
+  .size  rseq_loop,.-rseq_loop
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/critical_arm64.S b/test/syscalls/linux/rseq/critical_arm64.S
new file mode 100644
index 000000000..bfe7e8307
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical_arm64.S
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Restartable sequences critical sections.
+
+// Loops continuously until aborted.
+//
+// void rseq_loop(struct rseq* r, struct rseq_cs* cs)
+
+  .text
+  .globl  rseq_loop
+  .type   rseq_loop, @function
+
+rseq_loop:
+  b begin
+
+  // Abort block before the critical section.
+  // Abort signature.
+  .byte 0x90, 0x90, 0x90, 0x90
+  .globl  rseq_loop_early_abort
+rseq_loop_early_abort:
+  ret
+
+begin:
+  // r->rseq_cs = cs
+  str x1, [x0, #8]
+
+  // N.B. rseq_cs will be cleared by any preempt, even outside the critical
+  // section. Thus it must be set in or immediately before the critical section
+  // to ensure it is not cleared before the section begins.
+  .globl  rseq_loop_start
+rseq_loop_start:
+  b rseq_loop_start
+
+  // "Pre-commit": extra instructions inside the critical section.  These are
+  // used as the abort point in TestAbortPreCommit, which is not valid.
+  .globl  rseq_loop_pre_commit
+rseq_loop_pre_commit:
+  // Extra abort signature + nop for TestAbortPostCommit.
+  .byte 0x90, 0x90, 0x90, 0x90
+  nop
+
+  // "Post-commit": never reached in this case.
+  .globl  rseq_loop_post_commit
+rseq_loop_post_commit:
+
+  // Abort signature.
+  .byte 0x90, 0x90, 0x90, 0x90
+
+  .globl  rseq_loop_abort
+rseq_loop_abort:
+  ret
+
+  .size  rseq_loop,.-rseq_loop
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/start.S b/test/syscalls/linux/rseq/start.S
deleted file mode 100644
index b9611b276..000000000
--- a/test/syscalls/linux/rseq/start.S
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-  .text
-  .align 4
-  .type  _start,@function
-  .globl  _start
-
-_start:
-  movq  %rsp,%rdi
-  call  __init
-  hlt
-
-  .size  _start,.-_start
-  .section  .note.GNU-stack,"",@progbits
-
-  .text
-  .globl  raw_syscall
-  .type   raw_syscall, @function
-
-raw_syscall:
-  mov  %rdi,%rax      // syscall #
-  mov  %rsi,%rdi      // arg0
-  mov  %rdx,%rsi      // arg1
-  mov  %rcx,%rdx      // arg2
-  mov  %r8,%r10       // arg3 (goes in r10 instead of rcx for system calls)
-  mov  %r9,%r8        // arg4
-  mov  0x8(%rsp),%r9  // arg5
-  syscall
-  ret
-
-  .size  raw_syscall,.-raw_syscall
-  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/start_amd64.S b/test/syscalls/linux/rseq/start_amd64.S
new file mode 100644
index 000000000..b9611b276
--- /dev/null
+++ b/test/syscalls/linux/rseq/start_amd64.S
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+  .text
+  .align 4
+  .type  _start,@function
+  .globl  _start
+
+_start:
+  movq  %rsp,%rdi
+  call  __init
+  hlt
+
+  .size  _start,.-_start
+  .section  .note.GNU-stack,"",@progbits
+
+  .text
+  .globl  raw_syscall
+  .type   raw_syscall, @function
+
+raw_syscall:
+  mov  %rdi,%rax      // syscall #
+  mov  %rsi,%rdi      // arg0
+  mov  %rdx,%rsi      // arg1
+  mov  %rcx,%rdx      // arg2
+  mov  %r8,%r10       // arg3 (goes in r10 instead of rcx for system calls)
+  mov  %r9,%r8        // arg4
+  mov  0x8(%rsp),%r9  // arg5
+  syscall
+  ret
+
+  .size  raw_syscall,.-raw_syscall
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/start_arm64.S b/test/syscalls/linux/rseq/start_arm64.S
new file mode 100644
index 000000000..693c1c6eb
--- /dev/null
+++ b/test/syscalls/linux/rseq/start_arm64.S
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+  .text
+  .align 4
+  .type  _start,@function
+  .globl  _start
+
+_start:
+  mov  x0, sp   // arg0 for __init: initial sp (mirrors movq %rsp,%rdi on amd64)
+  bl   __init
+  wfi
+
+  .size  _start,.-_start
+  .section  .note.GNU-stack,"",@progbits
+
+  .text
+  .globl  raw_syscall
+  .type   raw_syscall, @function
+
+raw_syscall:
+  mov  x8,x0   // syscall #
+  mov  x0,x1   // arg0
+  mov  x1,x2   // arg1
+  mov  x2,x3   // arg2
+  mov  x3,x4   // arg3
+  mov  x4,x5   // arg4
+  mov  x5,x6   // arg5
+  svc  #0
+  ret
+
+  .size  raw_syscall,.-raw_syscall
+  .section  .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/syscalls.h b/test/syscalls/linux/rseq/syscalls.h
index e5299c188..c4118e6c5 100644
--- a/test/syscalls/linux/rseq/syscalls.h
+++ b/test/syscalls/linux/rseq/syscalls.h
@@ -17,10 +17,13 @@
 
 #include "test/syscalls/linux/rseq/types.h"
 
-#ifdef __x86_64__
 // Syscall numbers.
+#if defined(__x86_64__)
 constexpr int kGetpid = 39;
 constexpr int kExitGroup = 231;
+#elif defined(__aarch64__)
+constexpr int kGetpid = 172;
+constexpr int kExitGroup = 94;
 #else
 #error "Unknown architecture"
 #endif
diff --git a/test/syscalls/linux/rseq/uapi.h b/test/syscalls/linux/rseq/uapi.h
index ca1d67691..d3e60d0a4 100644
--- a/test/syscalls/linux/rseq/uapi.h
+++ b/test/syscalls/linux/rseq/uapi.h
@@ -19,9 +19,11 @@
 
 // User-kernel ABI for restartable sequences.
 
-#ifdef __x86_64__
 // Syscall numbers.
+#if defined(__x86_64__)
 constexpr int kRseqSyscall = 334;
+#elif defined(__aarch64__)
+constexpr int kRseqSyscall = 293;
 #else
 #error "Unknown architecture"
 #endif  // __x86_64__
-- 
cgit v1.2.3


From 639d94f9f71b43e86320a6e9157c932f5d7936a7 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 31 Mar 2020 19:15:55 -0700
Subject: Add socket filesystem and global disconnected socket mount for VFS2.

A socket mount where anonymous sockets will reside is added to the
VirtualFilesystem. Socketfs is built on top of kernfs.

Updates #1476, #1478, #1484, #1485.

PiperOrigin-RevId: 304095251
---
 pkg/sentry/fsimpl/sockfs/BUILD     | 16 +++++++++
 pkg/sentry/fsimpl/sockfs/sockfs.go | 73 ++++++++++++++++++++++++++++++++++++++
 pkg/sentry/kernel/BUILD            |  1 +
 pkg/sentry/kernel/kernel.go        | 24 +++++++++++++
 test/syscalls/linux/socket_unix.cc |  2 ++
 5 files changed, 116 insertions(+)
 create mode 100644 pkg/sentry/fsimpl/sockfs/BUILD
 create mode 100644 pkg/sentry/fsimpl/sockfs/sockfs.go

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD
new file mode 100644
index 000000000..790d50e65
--- /dev/null
+++ b/pkg/sentry/fsimpl/sockfs/BUILD
@@ -0,0 +1,16 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "sockfs",
+    srcs = ["sockfs.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/context",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
new file mode 100644
index 000000000..c13511de2
--- /dev/null
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -0,0 +1,73 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sockfs provides a filesystem implementation for anonymous sockets.
+package sockfs
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// NewFilesystem creates a new sockfs filesystem.
+//
+// Note that there should only ever be one instance of sockfs.Filesystem,
+// backing a global socket mount.
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem {
+	fs, _, err := filesystemType{}.GetFilesystem(nil, vfsObj, nil, "", vfs.GetFilesystemOptions{})
+	if err != nil {
+		panic("failed to create sockfs filesystem")
+	}
+	return fs
+}
+
+// filesystemType implements vfs.FilesystemType.
+type filesystemType struct{}
+
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	fs := &filesystem{}
+	fs.Init(vfsObj, fsType)
+	return fs.VFSFilesystem(), nil, nil
+}
+
+// Name implements FilesystemType.Name.
+//
+// Note that registering sockfs is unnecessary, except for the fact that it
+// will not show up under /proc/filesystems as a result. This is a very minor
+// discrepancy from Linux.
+func (filesystemType) Name() string {
+	return "sockfs"
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	kernfs.Filesystem
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	return nil, syserror.ENXIO
+}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index beba29a09..bb7e3cbc3 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -169,6 +169,7 @@ go_library(
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fs/timerfd",
         "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fsimpl/sockfs",
         "//pkg/sentry/hostcpu",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 6feda8fa1..0a448b57c 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -50,6 +50,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
 	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -225,6 +226,11 @@ type Kernel struct {
 	// by extMu.
 	nextSocketEntry uint64
 
+	// socketMount is a disconnected vfs.Mount, not included in k.vfs,
+	// representing a sockfs.filesystem. socketMount is used to back
+	// VirtualDentries representing anonymous sockets.
+	socketMount *vfs.Mount
+
 	// deviceRegistry is used to save/restore device.SimpleDevices.
 	deviceRegistry struct{} `state:".(*device.Registry)"`
 
@@ -348,6 +354,19 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
 	k.futexes = futex.NewManager()
 	k.netlinkPorts = port.New()
+	if VFS2Enabled {
+		if err := k.vfs.Init(); err != nil {
+			return fmt.Errorf("failed to initialize VFS: %v", err)
+		}
+		fs := sockfs.NewFilesystem(&k.vfs)
+		// NewDisconnectedMount will take an additional reference on fs.
+		defer fs.DecRef()
+		sm, err := k.vfs.NewDisconnectedMount(fs, nil, &vfs.MountOptions{})
+		if err != nil {
+			return fmt.Errorf("failed to initialize socket mount: %v", err)
+		}
+		k.socketMount = sm
+	}
 	return nil
 }
 
@@ -1452,6 +1471,11 @@ func (k *Kernel) ListSockets() []*SocketEntry {
 	return socks
 }
 
+// SocketMount returns the global socket mount.
+func (k *Kernel) SocketMount() *vfs.Mount {
+	return k.socketMount
+}
+
 // supervisorContext is a privileged context.
 type supervisorContext struct {
 	context.NoopSleeper
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc
index 4cf1f76f1..8bf663e8b 100644
--- a/test/syscalls/linux/socket_unix.cc
+++ b/test/syscalls/linux/socket_unix.cc
@@ -257,6 +257,8 @@ TEST_P(UnixSocketPairTest, ShutdownWrite) {
 
 TEST_P(UnixSocketPairTest, SocketReopenFromProcfs) {
   // TODO(b/122310852): We should be returning ENXIO and NOT EIO.
+  // TODO(gvisor.dev/issue/1624): This should be resolved in VFS2. Verify
+  // that this is the case and delete the SKIP_IF once we delete VFS1.
   SKIP_IF(IsRunningOnGvisor());
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
-- 
cgit v1.2.3


From c6d5742c21c19f9cf8b964b49b8df935c1303417 Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Thu, 2 Apr 2020 10:39:56 -0700
Subject: Fix flaky TCPLinger2TimeoutAfterClose test.

The test is flaky in cooperative S/R mode because TCP timers are not restored
across a S/R. This can cause the TCPLinger2 timer to not fire. This change
disables S/R before setting the TCP_LINGER2 timeout.

PiperOrigin-RevId: 304430536
---
 test/syscalls/linux/socket_inet_loopback.cc | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index b24618a88..16888de2a 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -605,15 +605,23 @@ TEST_P(SocketInetLoopbackTest, TCPLinger2TimeoutAfterClose_NoRandomSave) {
                   &conn_addrlen),
       SyscallSucceeds());
 
-  constexpr int kTCPLingerTimeout = 5;
-  EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2,
-                         &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
-              SyscallSucceedsWithValue(0));
-
-  // close the connecting FD to trigger FIN_WAIT2  on the connected fd.
-  conn_fd.reset();
-
-  absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+  // Disable cooperative saves after this point as TCP timers are not restored
+  // across a S/R.
+  {
+    DisableSave ds;
+    constexpr int kTCPLingerTimeout = 5;
+    EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2,
+                           &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
+                SyscallSucceedsWithValue(0));
+
+    // close the connecting FD to trigger FIN_WAIT2  on the connected fd.
+    conn_fd.reset();
+
+    absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+
+    // ds going out of scope will Re-enable S/R's since at this point the timer
+    // must have fired and cleaned up the endpoint.
+  }
 
   // Now bind and connect a new socket and verify that we can immediately
   // rebind the address bound by the conn_fd as it never entered TIME_WAIT.
-- 
cgit v1.2.3


From 5b2396d244ed6283d928a72bdd4cc58d78ef3175 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 2 Apr 2020 17:06:19 -0700
Subject: Fix typo in TODO comments.

PiperOrigin-RevId: 304508083
---
 pkg/sentry/fs/proc/mounts.go                | 3 ++-
 pkg/sentry/fsimpl/tmpfs/filesystem.go       | 6 +++---
 pkg/sentry/fsimpl/tmpfs/stat_test.go        | 4 ++--
 pkg/sentry/fsimpl/tmpfs/tmpfs.go            | 2 +-
 pkg/sentry/socket/netstack/netstack.go      | 2 +-
 pkg/sentry/vfs/mount.go                     | 3 ++-
 test/syscalls/linux/socket_inet_loopback.cc | 2 +-
 7 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go
index 94deb553b..1fc9c703c 100644
--- a/pkg/sentry/fs/proc/mounts.go
+++ b/pkg/sentry/fs/proc/mounts.go
@@ -170,7 +170,8 @@ func superBlockOpts(mountPath string, msrc *fs.MountSource) string {
 	// NOTE(b/147673608): If the mount is a cgroup, we also need to include
 	// the cgroup name in the options. For now we just read that from the
 	// path.
-	// TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we
+	//
+	// TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we
 	// should get this value from the cgroup itself, and not rely on the
 	// path.
 	if msrc.FilesystemType == "cgroup" {
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index e678ecc37..4cf27bf13 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -57,7 +57,7 @@ afterSymlink:
 	}
 	next := nextVFSD.Impl().(*dentry)
 	if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
-		// TODO(gvisor.dev/issues/1197): Symlink traversals updates
+		// TODO(gvisor.dev/issue/1197): Symlink traversals updates
 		// access time.
 		if err := rp.HandleSymlink(symlink.target); err != nil {
 			return nil, err
@@ -515,7 +515,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		oldParent.inode.decLinksLocked()
 		newParent.inode.incLinksLocked()
 	}
-	// TODO(gvisor.dev/issues/1197): Update timestamps and parent directory
+	// TODO(gvisor.dev/issue/1197): Update timestamps and parent directory
 	// sizes.
 	vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
 	return nil
@@ -600,7 +600,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 	if err != nil {
 		return linux.Statfs{}, err
 	}
-	// TODO(gvisor.dev/issues/1197): Actually implement statfs.
+	// TODO(gvisor.dev/issue/1197): Actually implement statfs.
 	return linux.Statfs{}, syserror.ENOSYS
 }
 
diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go
index ebe035dee..3e02e7190 100644
--- a/pkg/sentry/fsimpl/tmpfs/stat_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go
@@ -29,7 +29,7 @@ func TestStatAfterCreate(t *testing.T) {
 	mode := linux.FileMode(0644)
 
 	// Run with different file types.
-	// TODO(gvisor.dev/issues/1197): Also test symlinks and sockets.
+	// TODO(gvisor.dev/issue/1197): Also test symlinks and sockets.
 	for _, typ := range []string{"file", "dir", "pipe"} {
 		t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
 			var (
@@ -169,7 +169,7 @@ func TestSetStat(t *testing.T) {
 	mode := linux.FileMode(0644)
 
 	// Run with different file types.
-	// TODO(gvisor.dev/issues/1197): Also test symlinks and sockets.
+	// TODO(gvisor.dev/issue/1197): Also test symlinks and sockets.
 	for _, typ := range []string{"file", "dir", "pipe"} {
 		t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
 			var (
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 8bc8818c0..54da15849 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -315,7 +315,7 @@ func (i *inode) statTo(stat *linux.Statx) {
 	stat.Atime = linux.NsecToStatxTimestamp(i.atime)
 	stat.Ctime = linux.NsecToStatxTimestamp(i.ctime)
 	stat.Mtime = linux.NsecToStatxTimestamp(i.mtime)
-	// TODO(gvisor.dev/issues/1197): Device number.
+	// TODO(gvisor.dev/issue/1197): Device number.
 	switch impl := i.impl.(type) {
 	case *regularFile:
 		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index f14c336b9..06a5b53bc 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -663,7 +663,7 @@ func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error
 // This is a hack to work around the fact that both IPv4 and IPv6 ANY are
 // represented by the empty string.
 //
-// TODO(gvisor.dev/issues/1556): remove this function.
+// TODO(gvisor.dev/issue/1556): remove this function.
 func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
 	if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
 		addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00"
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 7792eb1a0..1b8ecc415 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -835,7 +835,8 @@ func superBlockOpts(mountPath string, mnt *Mount) string {
 	// NOTE(b/147673608): If the mount is a cgroup, we also need to include
 	// the cgroup name in the options. For now we just read that from the
 	// path.
-	// TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we
+	//
+	// TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we
 	// should get this value from the cgroup itself, and not rely on the
 	// path.
 	if mnt.fs.FilesystemType().Name() == "cgroup" {
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 16888de2a..2ffc86382 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -234,7 +234,7 @@ TEST_P(DualStackSocketTest, AddressOperations) {
   }
 }
 
-// TODO(gvisor.dev/issues/1556): uncomment V4MappedAny.
+// TODO(gvisor.dev/issue/1556): uncomment V4MappedAny.
 INSTANTIATE_TEST_SUITE_P(
     All, DualStackSocketTest,
     ::testing::Combine(
-- 
cgit v1.2.3


From 1921c246a9907cd1623af4aabde086af9cf172d8 Mon Sep 17 00:00:00 2001
From: gVisor bot <gvisor-bot@google.com>
Date: Fri, 3 Apr 2020 10:19:42 -0700
Subject: Internal change.

PiperOrigin-RevId: 304641990
---
 test/syscalls/linux/proc_net_unix.cc | 6 +++---
 test/syscalls/linux/pty.cc           | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc
index 66db0acaa..a63067586 100644
--- a/test/syscalls/linux/proc_net_unix.cc
+++ b/test/syscalls/linux/proc_net_unix.cc
@@ -106,7 +106,7 @@ PosixErrorOr<std::vector<UnixEntry>> ProcNetUnixEntries() {
   std::vector<UnixEntry> entries;
   std::vector<std::string> lines = absl::StrSplit(content, '\n');
   std::cerr << "<contents of /proc/net/unix>" << std::endl;
-  for (std::string line : lines) {
+  for (const std::string& line : lines) {
     // Emit the proc entry to the test output to provide context for the test
     // results.
     std::cerr << line << std::endl;
@@ -374,7 +374,7 @@ TEST(ProcNetUnix, DgramSocketStateDisconnectingOnBind) {
   // corresponding entries, as they don't have an address yet.
   if (IsRunningOnGvisor()) {
     ASSERT_EQ(entries.size(), 2);
-    for (auto e : entries) {
+    for (const auto& e : entries) {
       ASSERT_EQ(e.state, SS_DISCONNECTING);
     }
   }
@@ -403,7 +403,7 @@ TEST(ProcNetUnix, DgramSocketStateConnectingOnConnect) {
   // corresponding entries, as they don't have an address yet.
   if (IsRunningOnGvisor()) {
     ASSERT_EQ(entries.size(), 2);
-    for (auto e : entries) {
+    for (const auto& e : entries) {
       ASSERT_EQ(e.state, SS_DISCONNECTING);
     }
   }
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
index dafe64d20..b8a0159ba 100644
--- a/test/syscalls/linux/pty.cc
+++ b/test/syscalls/linux/pty.cc
@@ -1126,7 +1126,7 @@ TEST_F(PtyTest, SwitchTwiceMultiline) {
   std::string kExpected = "GO\nBLUE\n!";
 
   // Write each line.
-  for (std::string input : kInputs) {
+  for (const std::string& input : kInputs) {
     ASSERT_THAT(WriteFd(master_.get(), input.c_str(), input.size()),
                 SyscallSucceedsWithValue(input.size()));
   }
-- 
cgit v1.2.3


From ea98693d915ebb55bb6b93797bc58d7675ffbe9d Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Fri, 3 Apr 2020 11:37:16 -0700
Subject: Add missing newline

PiperOrigin-RevId: 304659346
---
 test/syscalls/linux/exec.cc                 | 10 ++++++----
 test/syscalls/linux/poll.cc                 |  2 +-
 test/syscalls/linux/proc_pid_smaps.cc       |  4 ++--
 test/syscalls/linux/ptrace.cc               |  2 +-
 test/syscalls/linux/sendfile_socket.cc      |  2 +-
 test/syscalls/linux/socket_inet_loopback.cc |  2 +-
 test/syscalls/linux/socket_netlink_route.cc |  2 +-
 test/util/capability_util.cc                |  4 ++--
 8 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 07bd527e6..12c9b05ca 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -812,26 +812,28 @@ void ExecFromThread() {
 bool ValidateProcCmdlineVsArgv(const int argc, const char* const* argv) {
   auto contents_or = GetContents("/proc/self/cmdline");
   if (!contents_or.ok()) {
-    std::cerr << "Unable to get /proc/self/cmdline: " << contents_or.error();
+    std::cerr << "Unable to get /proc/self/cmdline: " << contents_or.error()
+              << std::endl;
     return false;
   }
   auto contents = contents_or.ValueOrDie();
   if (contents.back() != '\0') {
-    std::cerr << "Non-null terminated /proc/self/cmdline!";
+    std::cerr << "Non-null terminated /proc/self/cmdline!" << std::endl;
     return false;
   }
   contents.pop_back();
   std::vector<std::string> procfs_cmdline = absl::StrSplit(contents, '\0');
 
   if (static_cast<int>(procfs_cmdline.size()) != argc) {
-    std::cerr << "argc = " << argc << " != " << procfs_cmdline.size();
+    std::cerr << "argc = " << argc << " != " << procfs_cmdline.size()
+              << std::endl;
     return false;
   }
 
   for (int i = 0; i < argc; ++i) {
     if (procfs_cmdline[i] != argv[i]) {
       std::cerr << "Procfs command line argument " << i << " mismatch "
-                << procfs_cmdline[i] << " != " << argv[i];
+                << procfs_cmdline[i] << " != " << argv[i] << std::endl;
       return false;
     }
   }
diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc
index c42472474..1e35a4a8b 100644
--- a/test/syscalls/linux/poll.cc
+++ b/test/syscalls/linux/poll.cc
@@ -266,7 +266,7 @@ TEST_F(PollTest, Nfds) {
   }
 
   rlim_t max_fds = rlim.rlim_cur;
-  std::cout << "Using limit: " << max_fds;
+  std::cout << "Using limit: " << max_fds << std::endl;
 
   // Create an eventfd. Since its value is initially zero, it is writable.
   FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD());
diff --git a/test/syscalls/linux/proc_pid_smaps.cc b/test/syscalls/linux/proc_pid_smaps.cc
index 7f2e8f203..9fb1b3a2c 100644
--- a/test/syscalls/linux/proc_pid_smaps.cc
+++ b/test/syscalls/linux/proc_pid_smaps.cc
@@ -173,7 +173,7 @@ PosixErrorOr<std::vector<ProcPidSmapsEntry>> ParseProcPidSmaps(
       return;
     }
     unknown_fields.insert(std::string(key));
-    std::cerr << "skipping unknown smaps field " << key;
+    std::cerr << "skipping unknown smaps field " << key << std::endl;
   };
 
   auto lines = absl::StrSplit(contents, '\n', absl::SkipEmpty());
@@ -191,7 +191,7 @@ PosixErrorOr<std::vector<ProcPidSmapsEntry>> ParseProcPidSmaps(
     // amount of whitespace).
     if (!entry) {
       std::cerr << "smaps line not considered a maps line: "
-                << maybe_maps_entry.error_message();
+                << maybe_maps_entry.error_message() << std::endl;
       return PosixError(
           EINVAL,
           absl::StrCat("smaps field line without preceding maps line: ", l));
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index bfe3e2603..cb828ff88 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -1188,7 +1188,7 @@ TEST(PtraceTest, SeizeSetOptions) {
     // gVisor is not susceptible to this race because
     // kernel.Task.waitCollectTraceeStopLocked() checks specifically for an
     // active ptraceStop, which is not initiated if SIGKILL is pending.
-    std::cout << "Observed syscall-exit after SIGKILL";
+    std::cout << "Observed syscall-exit after SIGKILL" << std::endl;
     ASSERT_THAT(waitpid(child_pid, &status, 0),
                 SyscallSucceedsWithValue(child_pid));
   }
diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc
index 8f7ee4163..e94672679 100644
--- a/test/syscalls/linux/sendfile_socket.cc
+++ b/test/syscalls/linux/sendfile_socket.cc
@@ -149,7 +149,7 @@ TEST_P(SendFileTest, SendMultiple) {
   for (size_t sent = 0; sent < data.size(); cnt++) {
     const size_t remain = data.size() - sent;
     std::cout << "sendfile, size=" << data.size() << ", sent=" << sent
-              << ", remain=" << remain;
+              << ", remain=" << remain << std::endl;
 
     // Send data and verify that sendfile returns the correct value.
     int res = sendfile(client.get(), inf.get(), nullptr, remain);
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 2ffc86382..1b34e4ef7 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -2212,7 +2212,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, PortReuseTwoSockets) {
           setsockopt(fd2, SOL_SOCKET, SO_REUSEPORT, &portreuse2, sizeof(int)),
           SyscallSucceeds());
 
-      std::cout << portreuse1 << " " << portreuse2;
+      std::cout << portreuse1 << " " << portreuse2 << std::endl;
       int ret = bind(fd2, reinterpret_cast<sockaddr*>(&addr), addrlen);
 
       // Verify that two sockets can be bound to the same port only if
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index e5aed1eec..2efb96bc3 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -152,7 +152,7 @@ TEST(NetlinkRouteTest, GetLinkDump) {
     const struct ifinfomsg* msg =
         reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
     std::cout << "Found interface idx=" << msg->ifi_index
-              << ", type=" << std::hex << msg->ifi_type;
+              << ", type=" << std::hex << msg->ifi_type << std::endl;
     if (msg->ifi_type == ARPHRD_LOOPBACK) {
       loopbackFound = true;
       EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
diff --git a/test/util/capability_util.cc b/test/util/capability_util.cc
index 9fee52fbb..a1b994c45 100644
--- a/test/util/capability_util.cc
+++ b/test/util/capability_util.cc
@@ -63,13 +63,13 @@ PosixErrorOr<bool> CanCreateUserNamespace() {
     // is in a chroot environment (i.e., the caller's root directory does
     // not match the root directory of the mount namespace in which it
     // resides)."
-    std::cerr << "clone(CLONE_NEWUSER) failed with EPERM";
+    std::cerr << "clone(CLONE_NEWUSER) failed with EPERM" << std::endl;
     return false;
   } else if (errno == EUSERS) {
     // "(since Linux 3.11) CLONE_NEWUSER was specified in flags, and the call
     // would cause the limit on the number of nested user namespaces to be
     // exceeded. See user_namespaces(7)."
-    std::cerr << "clone(CLONE_NEWUSER) failed with EUSERS";
+    std::cerr << "clone(CLONE_NEWUSER) failed with EUSERS" << std::endl;
     return false;
   } else {
     // Unexpected error code; indicate an actual error.
-- 
cgit v1.2.3


From a94309628ebbc2e6c4997890f1b966fa7a16be20 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Fri, 3 Apr 2020 13:39:45 -0700
Subject: Ensure EOF is handled properly during splice.

PiperOrigin-RevId: 304684417
---
 pkg/sentry/kernel/pipe/pipe.go  | 13 ++++++++++---
 test/syscalls/linux/sendfile.cc | 28 ++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 3 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 725e9db7d..62c8691f1 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -255,7 +255,8 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
 	// POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
 	// atomic, but requires no atomicity for writes larger than this.
 	wanted := ops.left()
-	if avail := p.max - p.view.Size(); wanted > avail {
+	avail := p.max - p.view.Size()
+	if wanted > avail {
 		if wanted <= p.atomicIOBytes {
 			return 0, syserror.ErrWouldBlock
 		}
@@ -268,8 +269,14 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
 		return done, err
 	}
 
-	if wanted > done {
-		// Partial write due to full pipe.
+	if done < avail {
+		// Non-failure, but short write.
+		return done, nil
+	}
+	if done < wanted {
+		// Partial write due to full pipe. Note that this could also be
+		// the short write case above, we would expect a second call
+		// and the write to return zero bytes in this case.
 		return done, syserror.ErrWouldBlock
 	}
 
diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc
index 580ab5193..ebaafe47e 100644
--- a/test/syscalls/linux/sendfile.cc
+++ b/test/syscalls/linux/sendfile.cc
@@ -530,6 +530,34 @@ TEST(SendFileTest, SendToSpecialFile) {
               SyscallSucceedsWithValue(kSize & (~7)));
 }
 
+TEST(SendFileTest, SendFileToPipe) {
+  // Create temp file.
+  constexpr char kData[] = "<insert-quote-here>";
+  constexpr int kDataSize = sizeof(kData) - 1;
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+  const FileDescriptor inf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Create a pipe for sending to a pipe.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor rfd(fds[0]);
+  const FileDescriptor wfd(fds[1]);
+
+  // Expect to read up to the given size.
+  std::vector<char> buf(kDataSize);
+  ScopedThread t([&]() {
+    absl::SleepFor(absl::Milliseconds(100));
+    ASSERT_THAT(read(rfd.get(), buf.data(), buf.size()),
+                SyscallSucceedsWithValue(kDataSize));
+  });
+
+  // Send with twice the size of the file, which should hit EOF.
+  EXPECT_THAT(sendfile(wfd.get(), inf.get(), nullptr, kDataSize * 2),
+              SyscallSucceedsWithValue(kDataSize));
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From 4baa7e70795edbb350d55a9365807341515d3af4 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Mon, 6 Apr 2020 09:50:13 -0700
Subject: Bump up acceptable sample count for flaky itimer test.

Running the test 1000x almost always produces 1+ test failures where
the sample count is slightly more than 60.

PiperOrigin-RevId: 305051754
---
 test/syscalls/linux/itimer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc
index 8b48f0804..dd981a278 100644
--- a/test/syscalls/linux/itimer.cc
+++ b/test/syscalls/linux/itimer.cc
@@ -246,7 +246,7 @@ int TestSIGPROFFairness(absl::Duration sleep) {
 
   // The number of samples on the main thread should be very low as it did
   // nothing.
-  TEST_CHECK(result.main_thread_samples < 60);
+  TEST_CHECK(result.main_thread_samples < 80);
 
   // Both workers should get roughly equal number of samples.
   TEST_CHECK(result.worker_samples.size() == 2);
-- 
cgit v1.2.3


From 94319a8241cb299edc812024d6132b7a3819a4dc Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 7 Apr 2020 09:40:38 -0700
Subject: Make gofer.dentry.destroyLocked idempotent

gofer operations accumulate dentries touched in a slice to call
checkCachingLocked on them when the operation is over. In case
the same dentry is touched multiple times during the operation,
checkCachingLocked, and consequently destroyLocked, may be called
more than once for the same dentry.

Updates #1198

PiperOrigin-RevId: 305276819
---
 pkg/sentry/fsimpl/gofer/BUILD         | 12 ++++++-
 pkg/sentry/fsimpl/gofer/gofer.go      | 36 +++++++++++++++++---
 pkg/sentry/fsimpl/gofer/gofer_test.go | 64 +++++++++++++++++++++++++++++++++++
 test/syscalls/linux/open.cc           | 22 ++++++++++++
 4 files changed, 129 insertions(+), 5 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/gofer/gofer_test.go

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index d15a36709..99d1e3f8f 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
@@ -54,3 +54,13 @@ go_library(
         "//pkg/usermem",
     ],
 )
+
+go_test(
+    name = "gofer_test",
+    srcs = ["gofer_test.go"],
+    library = ":gofer",
+    deps = [
+        "//pkg/p9",
+        "//pkg/sentry/contexttest",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index adee8bb60..20edaf643 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -444,7 +444,8 @@ type dentry struct {
 
 	// refs is the reference count. Each dentry holds a reference on its
 	// parent, even if disowned. refs is accessed using atomic memory
-	// operations.
+	// operations. When refs reaches 0, the dentry may be added to the cache or
+	// destroyed. If refs==-1 the dentry has already been destroyed.
 	refs int64
 
 	// fs is the owning filesystem. fs is immutable.
@@ -860,7 +861,7 @@ func (d *dentry) IncRef() {
 func (d *dentry) TryIncRef() bool {
 	for {
 		refs := atomic.LoadInt64(&d.refs)
-		if refs == 0 {
+		if refs <= 0 {
 			return false
 		}
 		if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
@@ -883,13 +884,20 @@ func (d *dentry) DecRef() {
 // checkCachingLocked should be called after d's reference count becomes 0 or it
 // becomes disowned.
 //
+// It may be called on a destroyed dentry. For example,
+// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times
+// for the same dentry when the dentry is visited more than once in the same
+// operation. One of the calls may destroy the dentry, so subsequent calls will
+// do nothing.
+//
 // Preconditions: d.fs.renameMu must be locked for writing.
 func (d *dentry) checkCachingLocked() {
 	// Dentries with a non-zero reference count must be retained. (The only way
 	// to obtain a reference on a dentry with zero references is via path
 	// resolution, which requires renameMu, so if d.refs is zero then it will
 	// remain zero while we hold renameMu for writing.)
-	if atomic.LoadInt64(&d.refs) != 0 {
+	refs := atomic.LoadInt64(&d.refs)
+	if refs > 0 {
 		if d.cached {
 			d.fs.cachedDentries.Remove(d)
 			d.fs.cachedDentriesLen--
@@ -897,6 +905,10 @@ func (d *dentry) checkCachingLocked() {
 		}
 		return
 	}
+	if refs == -1 {
+		// Dentry has already been destroyed.
+		return
+	}
 	// Non-child dentries with zero references are no longer reachable by path
 	// resolution and should be dropped immediately.
 	if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() {
@@ -949,9 +961,22 @@ func (d *dentry) checkCachingLocked() {
 	}
 }
 
+// destroyLocked destroys the dentry. It may flush dirty pages from cache,
+// close the p9 file and remove the reference on the parent dentry.
+//
 // Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is
 // not a child dentry.
 func (d *dentry) destroyLocked() {
+	switch atomic.LoadInt64(&d.refs) {
+	case 0:
+		// Mark the dentry destroyed.
+		atomic.StoreInt64(&d.refs, -1)
+	case -1:
+		panic("dentry.destroyLocked() called on already destroyed dentry")
+	default:
+		panic("dentry.destroyLocked() called with references on the dentry")
+	}
+
 	ctx := context.Background()
 	d.handleMu.Lock()
 	if !d.handle.file.isNil() {
@@ -971,7 +996,10 @@ func (d *dentry) destroyLocked() {
 		d.handle.close(ctx)
 	}
 	d.handleMu.Unlock()
-	d.file.close(ctx)
+	if !d.file.isNil() {
+		d.file.close(ctx)
+		d.file = p9file{}
+	}
 	// Remove d from the set of all dentries.
 	d.fs.syncMu.Lock()
 	delete(d.fs.dentries, d)
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
new file mode 100644
index 000000000..82bc239db
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -0,0 +1,64 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync/atomic"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+)
+
+func TestDestroyIdempotent(t *testing.T) {
+	fs := filesystem{
+		dentries: make(map[*dentry]struct{}),
+		opts: filesystemOptions{
+			// Test relies on no dentry being held in the cache.
+			maxCachedDentries: 0,
+		},
+	}
+
+	ctx := contexttest.Context(t)
+	attr := &p9.Attr{
+		Mode: p9.ModeRegular,
+	}
+	mask := p9.AttrMask{
+		Mode: true,
+		Size: true,
+	}
+	parent, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr)
+	if err != nil {
+		t.Fatalf("fs.newDentry(): %v", err)
+	}
+
+	child, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr)
+	if err != nil {
+		t.Fatalf("fs.newDentry(): %v", err)
+	}
+	parent.IncRef() // reference held by child on its parent.
+	parent.vfsd.InsertChild(&child.vfsd, "child")
+
+	child.checkCachingLocked()
+	if got := atomic.LoadInt64(&child.refs); got != -1 {
+		t.Fatalf("child.refs=%d, want: -1", got)
+	}
+	// Parent will also be destroyed when child reference is removed.
+	if got := atomic.LoadInt64(&parent.refs); got != -1 {
+		t.Fatalf("parent.refs=%d, want: -1", got)
+	}
+	child.checkCachingLocked()
+	child.checkCachingLocked()
+}
diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc
index 267ae19f6..640fe6bfc 100644
--- a/test/syscalls/linux/open.cc
+++ b/test/syscalls/linux/open.cc
@@ -186,6 +186,28 @@ TEST_F(OpenTest, OpenNoFollowStillFollowsLinksInPath) {
       ASSERT_NO_ERRNO_AND_VALUE(Open(path_via_symlink, O_RDONLY | O_NOFOLLOW));
 }
 
+// Test that open(2) can follow symlinks that point back to the same tree.
+// Test sets up files as follows:
+//   root/child/symlink => redirects to ../..
+//   root/child/target => regular file
+//
+// open("root/child/symlink/root/child/target")
+TEST_F(OpenTest, SymlinkRecurse) {
+  auto root =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(GetAbsoluteTestTmpdir()));
+  auto child = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path()));
+  auto symlink = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(child.path(), "../.."));
+  auto target = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateFileWith(child.path(), "abc", 0644));
+  auto path_via_symlink =
+      JoinPath(symlink.path(), Basename(root.path()), Basename(child.path()),
+               Basename(target.path()));
+  const auto contents =
+      ASSERT_NO_ERRNO_AND_VALUE(GetContents(path_via_symlink));
+  ASSERT_EQ(contents, "abc");
+}
+
 TEST_F(OpenTest, Fault) {
   char* totally_not_null = nullptr;
   ASSERT_THAT(open(totally_not_null, O_RDONLY), SyscallFailsWithErrno(EFAULT));
-- 
cgit v1.2.3


From 71770e56629339c9853466e994b78b172bc668a9 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Tue, 7 Apr 2020 13:27:26 -0700
Subject: mkdir test: Address TODOs and re-enable a test.

PiperOrigin-RevId: 305328184
---
 test/syscalls/linux/mkdir.cc | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc
index def4c50a4..4036a9275 100644
--- a/test/syscalls/linux/mkdir.cc
+++ b/test/syscalls/linux/mkdir.cc
@@ -36,21 +36,12 @@ class MkdirTest : public ::testing::Test {
 
   // TearDown unlinks created files.
   void TearDown() override {
-    // FIXME(edahlgren): We don't currently implement rmdir.
-    // We do this unconditionally because there's no harm in trying.
-    rmdir(dirname_.c_str());
+    EXPECT_THAT(rmdir(dirname_.c_str()), SyscallSucceeds());
   }
 
   std::string dirname_;
 };
 
-TEST_F(MkdirTest, DISABLED_CanCreateReadbleDir) {
-  ASSERT_THAT(mkdir(dirname_.c_str(), 0444), SyscallSucceeds());
-  ASSERT_THAT(
-      open(JoinPath(dirname_, "anything").c_str(), O_RDWR | O_CREAT, 0666),
-      SyscallFailsWithErrno(EACCES));
-}
-
 TEST_F(MkdirTest, CanCreateWritableDir) {
   ASSERT_THAT(mkdir(dirname_.c_str(), 0777), SyscallSucceeds());
   std::string filename = JoinPath(dirname_, "anything");
@@ -84,10 +75,11 @@ TEST_F(MkdirTest, FailsOnDirWithoutWritePerms) {
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
 
-  auto parent = ASSERT_NO_ERRNO_AND_VALUE(
-      TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0555));
-  auto dir = JoinPath(parent.path(), "foo");
-  ASSERT_THAT(mkdir(dir.c_str(), 0777), SyscallFailsWithErrno(EACCES));
+  ASSERT_THAT(mkdir(dirname_.c_str(), 0555), SyscallSucceeds());
+  auto dir = JoinPath(dirname_.c_str(), "foo");
+  EXPECT_THAT(mkdir(dir.c_str(), 0777), SyscallFailsWithErrno(EACCES));
+  EXPECT_THAT(open(JoinPath(dirname_, "file").c_str(), O_RDWR | O_CREAT, 0666),
+              SyscallFailsWithErrno(EACCES));
 }
 
 }  // namespace
-- 
cgit v1.2.3


From c7d841ac6e0be2aaacd6a3a81786508be797f667 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 8 Apr 2020 00:25:16 -0700
Subject: tests: Specify NoRandomSave for PortReuse tests

SO_REUSEPORT is not properly restored:
https://github.com/google/gvisor/issues/873

PiperOrigin-RevId: 305422775
---
 test/syscalls/linux/socket_inet_loopback.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 1b34e4ef7..030c3b835 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -1157,7 +1157,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
                 EquivalentWithin((kConnectAttempts / kThreadCount), 0.10));
 }
 
-TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) {
+TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread_NoRandomSave) {
   auto const& param = GetParam();
 
   TestAddress const& listener = param.listener;
@@ -1270,7 +1270,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) {
                 EquivalentWithin((kConnectAttempts / kThreadCount), 0.10));
 }
 
-TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
+TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort_NoRandomSave) {
   auto const& param = GetParam();
 
   TestAddress const& listener = param.listener;
@@ -2146,8 +2146,9 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReservedReuseAddr) {
                          &kSockOptOn, sizeof(kSockOptOn)),
               SyscallSucceeds());
 
-  ASSERT_THAT(connect(connected_fd.get(),
-                      reinterpret_cast<sockaddr*>(&bound_addr), bound_addr_len),
+  ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+                                  reinterpret_cast<sockaddr*>(&bound_addr),
+                                  bound_addr_len),
               SyscallSucceeds());
 
   // Get the ephemeral port.
-- 
cgit v1.2.3


From a86ffefd3f52dede3ffd6ae3c20d67734ecc2616 Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Wed, 8 Apr 2020 04:06:14 -0400
Subject: Enable exec_binary syscall test on Arm64

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 test/syscalls/linux/BUILD          |   5 +-
 test/syscalls/linux/exec_binary.cc | 164 ++++++++++++++++++++++++++++++++-----
 2 files changed, 143 insertions(+), 26 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index d0c431234..9447b06a8 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -663,10 +663,7 @@ cc_binary(
 cc_binary(
     name = "exec_binary_test",
     testonly = 1,
-    srcs = select_arch(
-        amd64 = ["exec_binary.cc"],
-        arm64 = [],
-    ),
+    srcs = ["exec_binary.cc"],
     linkstatic = 1,
     deps = [
         "//test/util:cleanup",
diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc
index 736452b0c..ae2683256 100644
--- a/test/syscalls/linux/exec_binary.cc
+++ b/test/syscalls/linux/exec_binary.cc
@@ -48,10 +48,17 @@ namespace {
 using ::testing::AnyOf;
 using ::testing::Eq;
 
-#ifndef __x86_64__
+#if !defined(__x86_64__) && !defined(__aarch64__)
 // The assembly stub and ELF internal details must be ported to other arches.
-#error "Test only supported on x86-64"
-#endif  // __x86_64__
+#error "Test only supported on x86-64/arm64"
+#endif  // __x86_64__ || __aarch64__
+
+#if defined(__x86_64__)
+#define EM_TYPE EM_X86_64
+#define IP_REG(p) ((p).rip)
+#define RAX_REG(p) ((p).rax)
+#define RDI_REG(p) ((p).rdi)
+#define RETURN_REG(p) ((p).rax)
 
 // amd64 stub that calls PTRACE_TRACEME and sends itself SIGSTOP.
 const char kPtraceCode[] = {
@@ -139,6 +146,76 @@ const char kPtraceCode[] = {
 // Size of a syscall instruction.
 constexpr int kSyscallSize = 2;
 
+#elif defined(__aarch64__)
+#define EM_TYPE EM_AARCH64
+#define IP_REG(p) ((p).pc)
+#define RAX_REG(p) ((p).regs[8])
+#define RDI_REG(p) ((p).regs[0])
+#define RETURN_REG(p) ((p).regs[0])
+
+const char kPtraceCode[] = {
+    // MOVD $117, R8 /* ptrace */
+    '\xa8',
+    '\x0e',
+    '\x80',
+    '\xd2',
+    // MOVD $0, R0 /* PTRACE_TRACEME */
+    '\x00',
+    '\x00',
+    '\x80',
+    '\xd2',
+    // MOVD $0, R1 /* pid */
+    '\x01',
+    '\x00',
+    '\x80',
+    '\xd2',
+    // MOVD $0, R2 /* addr */
+    '\x02',
+    '\x00',
+    '\x80',
+    '\xd2',
+    // MOVD $0, R3 /* data */
+    '\x03',
+    '\x00',
+    '\x80',
+    '\xd2',
+    // SVC
+    '\x01',
+    '\x00',
+    '\x00',
+    '\xd4',
+    // MOVD $172, R8 /* getpid */
+    '\x88',
+    '\x15',
+    '\x80',
+    '\xd2',
+    // SVC
+    '\x01',
+    '\x00',
+    '\x00',
+    '\xd4',
+    // MOVD $129, R8 /* kill, R0=pid */
+    '\x28',
+    '\x10',
+    '\x80',
+    '\xd2',
+    // MOVD $19, R1  /* SIGSTOP */
+    '\x61',
+    '\x02',
+    '\x80',
+    '\xd2',
+    // SVC
+    '\x01',
+    '\x00',
+    '\x00',
+    '\xd4',
+};
+// Size of a syscall instruction.
+constexpr int kSyscallSize = 4;
+#else
+#error "Unknown architecture"
+#endif
+
 // This test suite tests executable loading in the kernel (ELF and interpreter
 // scripts).
 
@@ -281,7 +358,7 @@ ElfBinary<64> StandardElf() {
   elf.header.e_ident[EI_DATA] = ELFDATA2LSB;
   elf.header.e_ident[EI_VERSION] = EV_CURRENT;
   elf.header.e_type = ET_EXEC;
-  elf.header.e_machine = EM_X86_64;
+  elf.header.e_machine = EM_TYPE;
   elf.header.e_version = EV_CURRENT;
   elf.header.e_phoff = sizeof(elf.header);
   elf.header.e_phentsize = sizeof(decltype(elf)::ElfPhdr);
@@ -327,9 +404,15 @@ TEST(ElfTest, Execute) {
   ASSERT_NO_ERRNO(WaitStopped(child));
 
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
-  // RIP is just beyond the final syscall instruction.
-  EXPECT_EQ(regs.rip, elf.header.e_entry + sizeof(kPtraceCode));
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
+  // RIP/PC is just beyond the final syscall instruction.
+  EXPECT_EQ(IP_REG(regs), elf.header.e_entry + sizeof(kPtraceCode));
 
   EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
                          {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0,
@@ -718,9 +801,16 @@ TEST(ElfTest, PIE) {
 
   // RIP tells us which page the first segment was loaded into.
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
 
-  const uint64_t load_addr = regs.rip & ~(kPageSize - 1);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
+
+  const uint64_t load_addr = IP_REG(regs) & ~(kPageSize - 1);
 
   EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
                          // text page.
@@ -787,9 +877,15 @@ TEST(ElfTest, PIENonZeroStart) {
 
   // RIP tells us which page the first segment was loaded into.
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
-  const uint64_t load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t load_addr = IP_REG(regs) & ~(kPageSize - 1);
 
   // The ELF is loaded at an arbitrary address, not the first PT_LOAD vaddr.
   //
@@ -910,9 +1006,15 @@ TEST(ElfTest, ELFInterpreter) {
   // RIP tells us which page the first segment of the interpreter was loaded
   // into.
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
-  const uint64_t interp_load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t interp_load_addr = IP_REG(regs) & ~(kPageSize - 1);
 
   EXPECT_THAT(
       child, ContainsMappings(std::vector<ProcMapsEntry>({
@@ -1084,9 +1186,15 @@ TEST(ElfTest, ELFInterpreterRelative) {
   // RIP tells us which page the first segment of the interpreter was loaded
   // into.
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+              SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
-  const uint64_t interp_load_addr = regs.rip & ~(kPageSize - 1);
+  const uint64_t interp_load_addr = IP_REG(regs) & ~(kPageSize - 1);
 
   EXPECT_THAT(
       child, ContainsMappings(std::vector<ProcMapsEntry>({
@@ -1480,14 +1588,21 @@ TEST(ExecveTest, BrkAfterBinary) {
   ASSERT_NO_ERRNO(WaitStopped(child));
 
   struct user_regs_struct regs;
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  struct iovec iov;
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+    SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
   // RIP is just beyond the final syscall instruction. Rewind to execute a brk
   // syscall.
-  regs.rip -= kSyscallSize;
-  regs.rax = __NR_brk;
-  regs.rdi = 0;
-  ASSERT_THAT(ptrace(PTRACE_SETREGS, child, 0, &regs), SyscallSucceeds());
+  IP_REG(regs) -= kSyscallSize;
+  RAX_REG(regs) = __NR_brk;
+  RDI_REG(regs) = 0;
+  ASSERT_THAT(ptrace(PTRACE_SETREGSET, child, NT_PRSTATUS, &iov),
+    SyscallSucceeds());
 
   // Resume the child, waiting for syscall entry.
   ASSERT_THAT(ptrace(PTRACE_SYSCALL, child, 0, 0), SyscallSucceeds());
@@ -1504,7 +1619,12 @@ TEST(ExecveTest, BrkAfterBinary) {
   ASSERT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
       << "status = " << status;
 
-  ASSERT_THAT(ptrace(PTRACE_GETREGS, child, 0, &regs), SyscallSucceeds());
+  iov.iov_base = &regs;
+  iov.iov_len = sizeof(regs);
+  EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+    SyscallSucceeds());
+  // Read exactly the full register set.
+  EXPECT_EQ(iov.iov_len, sizeof(regs));
 
   // brk is after the text page.
   //
@@ -1512,7 +1632,7 @@ TEST(ExecveTest, BrkAfterBinary) {
   // address will be, but it is always beyond the final page in the binary.
   // i.e., it does not start immediately after memsz in the middle of a page.
   // Userspace may expect to use that space.
-  EXPECT_GE(regs.rax, 0x41000);
+  EXPECT_GE(RETURN_REG(regs), 0x41000);
 }
 
 }  // namespace
-- 
cgit v1.2.3


From b30130567d81157e39b692e0116f86015f0bcc71 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 8 Apr 2020 13:33:44 -0700
Subject: Enable SubprocessExited and SubprocessZombie for gVisor

Updates #164

PiperOrigin-RevId: 305544029
---
 pkg/sentry/fs/proc/task.go           | 28 ++++++++++++++++--
 pkg/sentry/fsimpl/proc/task.go       | 16 -----------
 pkg/sentry/fsimpl/proc/task_files.go | 56 ++++++++++++++++++++++++++++++++++--
 test/syscalls/linux/proc.cc          | 31 ++++++++------------
 4 files changed, 90 insertions(+), 41 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index d6c5dd2c1..4d42eac83 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -57,6 +57,16 @@ func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) {
 	return m, nil
 }
 
+func checkTaskState(t *kernel.Task) error {
+	switch t.ExitState() {
+	case kernel.TaskExitZombie:
+		return syserror.EACCES
+	case kernel.TaskExitDead:
+		return syserror.ESRCH
+	}
+	return nil
+}
+
 // taskDir represents a task-level directory.
 //
 // +stateify savable
@@ -254,11 +264,12 @@ func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
 }
 
 func (e *exe) executable() (file fsbridge.File, err error) {
+	if err := checkTaskState(e.t); err != nil {
+		return nil, err
+	}
 	e.t.WithMuLocked(func(t *kernel.Task) {
 		mm := t.MemoryManager()
 		if mm == nil {
-			// TODO(b/34851096): Check shouldn't allow Readlink once the
-			// Task is zombied.
 			err = syserror.EACCES
 			return
 		}
@@ -268,7 +279,7 @@ func (e *exe) executable() (file fsbridge.File, err error) {
 		// (with locks held).
 		file = mm.Executable()
 		if file == nil {
-			err = syserror.ENOENT
+			err = syserror.ESRCH
 		}
 	})
 	return
@@ -313,11 +324,22 @@ func newNamespaceSymlink(t *kernel.Task, msrc *fs.MountSource, name string) *fs.
 	return newProcInode(t, n, msrc, fs.Symlink, t)
 }
 
+// Readlink reads the symlink value.
+func (n *namespaceSymlink) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
+	if err := checkTaskState(n.t); err != nil {
+		return "", err
+	}
+	return n.Symlink.Readlink(ctx, inode)
+}
+
 // Getlink implements fs.InodeOperations.Getlink.
 func (n *namespaceSymlink) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) {
 	if !kernel.ContextCanTrace(ctx, n.t, false) {
 		return nil, syserror.EACCES
 	}
+	if err := checkTaskState(n.t); err != nil {
+		return nil, err
+	}
 
 	// Create a new regular file to fake the namespace file.
 	iops := fsutil.NewNoReadWriteFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0777), linux.PROC_SUPER_MAGIC)
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index aee2a4392..888afc0fd 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -214,22 +214,6 @@ func newIO(t *kernel.Task, isThreadGroup bool) *ioData {
 	return &ioData{ioUsage: t}
 }
 
-func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
-	// Namespace symlinks should contain the namespace name and the inode number
-	// for the namespace instance, so for example user:[123456]. We currently fake
-	// the inode number by sticking the symlink inode in its place.
-	target := fmt.Sprintf("%s:[%d]", ns, ino)
-
-	inode := &kernfs.StaticSymlink{}
-	// Note: credentials are overridden by taskOwnedInode.
-	inode.Init(task.Credentials(), ino, target)
-
-	taskInode := &taskOwnedInode{Inode: inode, owner: task}
-	d := &kernfs.Dentry{}
-	d.Init(taskInode)
-	return d
-}
-
 // newCgroupData creates inode that shows cgroup information.
 // From man 7 cgroups: "For each cgroup hierarchy of which the process is a
 // member, there is one entry containing three colon-separated fields:
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 88ea6a6d8..2c6f8bdfc 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -64,6 +64,16 @@ func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) {
 	return m, nil
 }
 
+func checkTaskState(t *kernel.Task) error {
+	switch t.ExitState() {
+	case kernel.TaskExitZombie:
+		return syserror.EACCES
+	case kernel.TaskExitDead:
+		return syserror.ESRCH
+	}
+	return nil
+}
+
 type bufferWriter struct {
 	buf *bytes.Buffer
 }
@@ -628,11 +638,13 @@ func (s *exeSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, er
 }
 
 func (s *exeSymlink) executable() (file fsbridge.File, err error) {
+	if err := checkTaskState(s.task); err != nil {
+		return nil, err
+	}
+
 	s.task.WithMuLocked(func(t *kernel.Task) {
 		mm := t.MemoryManager()
 		if mm == nil {
-			// TODO(b/34851096): Check shouldn't allow Readlink once the
-			// Task is zombied.
 			err = syserror.EACCES
 			return
 		}
@@ -642,7 +654,7 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) {
 		// (with locks held).
 		file = mm.Executable()
 		if file == nil {
-			err = syserror.ENOENT
+			err = syserror.ESRCH
 		}
 	})
 	return
@@ -709,3 +721,41 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf)
 	return nil
 }
+
+type namespaceSymlink struct {
+	kernfs.StaticSymlink
+
+	task *kernel.Task
+}
+
+func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
+	// Namespace symlinks should contain the namespace name and the inode number
+	// for the namespace instance, so for example user:[123456]. We currently fake
+	// the inode number by sticking the symlink inode in its place.
+	target := fmt.Sprintf("%s:[%d]", ns, ino)
+
+	inode := &namespaceSymlink{task: task}
+	// Note: credentials are overridden by taskOwnedInode.
+	inode.Init(task.Credentials(), ino, target)
+
+	taskInode := &taskOwnedInode{Inode: inode, owner: task}
+	d := &kernfs.Dentry{}
+	d.Init(taskInode)
+	return d
+}
+
+// Readlink implements Inode.
+func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) {
+	if err := checkTaskState(s.task); err != nil {
+		return "", err
+	}
+	return s.StaticSymlink.Readlink(ctx)
+}
+
+// Getlink implements Inode.Getlink.
+func (s *namespaceSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) {
+	if err := checkTaskState(s.task); err != nil {
+		return vfs.VirtualDentry{}, "", err
+	}
+	return s.StaticSymlink.Getlink(ctx)
+}
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 5a70f6c3b..da98e1f66 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -1326,8 +1326,6 @@ TEST(ProcPidSymlink, SubprocessRunning) {
               SyscallSucceedsWithValue(sizeof(buf)));
 }
 
-// FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
-// on proc files.
 TEST(ProcPidSymlink, SubprocessZombied) {
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
@@ -1337,7 +1335,7 @@ TEST(ProcPidSymlink, SubprocessZombied) {
   int want = EACCES;
   if (!IsRunningOnGvisor()) {
     auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion());
-    if (version.major == 4 && version.minor > 3) {
+    if (version.major > 4 || (version.major == 4 && version.minor > 3)) {
       want = ENOENT;
     }
   }
@@ -1350,30 +1348,25 @@ TEST(ProcPidSymlink, SubprocessZombied) {
                 SyscallFailsWithErrno(want));
   }
 
-  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
-  // on proc files.
+  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between Linux versions
+  // on proc files.
   //
   // ~4.3: Syscall fails with EACCES.
-  // 4.17 & gVisor: Syscall succeeds and returns 1.
+  // 4.17: Syscall succeeds and returns 1.
   //
-  // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)),
-  //            SyscallFailsWithErrno(EACCES));
+  if (!IsRunningOnGvisor()) {
+    return;
+  }
 
-  // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
-  // on proc files.
-  //
-  // ~4.3: Syscall fails with EACCES.
-  // 4.17 & gVisor: Syscall succeeds and returns 1.
-  //
-  // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)),
-  //            SyscallFailsWithErrno(EACCES));
+  EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)),
+              SyscallFailsWithErrno(want));
+
+  EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)),
+              SyscallFailsWithErrno(want));
 }
 
 // Test whether /proc/PID/ symlinks can be read for an exited process.
 TEST(ProcPidSymlink, SubprocessExited) {
-  // FIXME(gvisor.dev/issue/164): These all succeed on gVisor.
-  SKIP_IF(IsRunningOnGvisor());
-
   char buf[1];
 
   EXPECT_THAT(ReadlinkWhileExited("exe", buf, sizeof(buf)),
-- 
cgit v1.2.3


From 2907e6da5e9fc7eeda51644db7bec4d15691b384 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Wed, 8 Apr 2020 13:46:51 -0700
Subject: file test: Remove FIXME about FIFO. It is already tested in mknod
 test.

PiperOrigin-RevId: 305546584
---
 test/syscalls/linux/file_base.h | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h
index 6f80bc97c..25fdd7106 100644
--- a/test/syscalls/linux/file_base.h
+++ b/test/syscalls/linux/file_base.h
@@ -52,17 +52,6 @@ class FileTest : public ::testing::Test {
     test_file_fd_ = ASSERT_NO_ERRNO_AND_VALUE(
         Open(test_file_name_, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR));
 
-    // FIXME(edahlgren): enable when mknod syscall is supported.
-    // test_fifo_name_ = NewTempAbsPath();
-    // ASSERT_THAT(mknod(test_fifo_name_.c_str()), S_IFIFO|0644, 0,
-    //             SyscallSucceeds());
-    // ASSERT_THAT(test_fifo_[1] = open(test_fifo_name_.c_str(),
-    //                                             O_WRONLY),
-    //             SyscallSucceeds());
-    // ASSERT_THAT(test_fifo_[0] = open(test_fifo_name_.c_str(),
-    //                                             O_RDONLY),
-    //             SyscallSucceeds());
-
     ASSERT_THAT(pipe(test_pipe_), SyscallSucceeds());
     ASSERT_THAT(fcntl(test_pipe_[0], F_SETFL, O_NONBLOCK), SyscallSucceeds());
   }
@@ -96,18 +85,11 @@ class FileTest : public ::testing::Test {
     CloseFile();
     UnlinkFile();
     ClosePipes();
-
-    // FIXME(edahlgren): enable when mknod syscall is supported.
-    // close(test_fifo_[0]);
-    // close(test_fifo_[1]);
-    // unlink(test_fifo_name_.c_str());
   }
 
   std::string test_file_name_;
-  std::string test_fifo_name_;
   FileDescriptor test_file_fd_;
 
-  int test_fifo_[2];
   int test_pipe_[2];
 };
 
-- 
cgit v1.2.3


From 357f136e42de81b033b65b7f39a4a555275a17e3 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Wed, 8 Apr 2020 14:38:09 -0700
Subject: Handle utimes correctly for shared gofer filesystems.

Determine system time from within the sentry rather than relying on the remote
filesystem to prevent inconsistencies.
Resolve related TODOs; the time discrepancies in question don't exist anymore.

PiperOrigin-RevId: 305557099
---
 pkg/sentry/fs/gofer/util.go   | 16 ++++++++++++++--
 test/syscalls/linux/utimes.cc | 18 +-----------------
 2 files changed, 15 insertions(+), 19 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go
index 2d8d3a2ea..47a6c69bf 100644
--- a/pkg/sentry/fs/gofer/util.go
+++ b/pkg/sentry/fs/gofer/util.go
@@ -20,17 +20,29 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 )
 
 func utimes(ctx context.Context, file contextFile, ts fs.TimeSpec) error {
 	if ts.ATimeOmit && ts.MTimeOmit {
 		return nil
 	}
+
+	// Replace requests to use the "system time" with the current time to
+	// ensure that timestamps remain consistent with the remote
+	// filesystem.
+	now := ktime.NowFromContext(ctx)
+	if ts.ATimeSetSystemTime {
+		ts.ATime = now
+	}
+	if ts.MTimeSetSystemTime {
+		ts.MTime = now
+	}
 	mask := p9.SetAttrMask{
 		ATime:              !ts.ATimeOmit,
-		ATimeNotSystemTime: !ts.ATimeSetSystemTime,
+		ATimeNotSystemTime: true,
 		MTime:              !ts.MTimeOmit,
-		MTimeNotSystemTime: !ts.MTimeSetSystemTime,
+		MTimeNotSystemTime: true,
 	}
 	as, ans := ts.ATime.Unix()
 	ms, mns := ts.MTime.Unix()
diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc
index 3a927a430..22e6d1a85 100644
--- a/test/syscalls/linux/utimes.cc
+++ b/test/syscalls/linux/utimes.cc
@@ -34,17 +34,10 @@ namespace testing {
 
 namespace {
 
-// TODO(b/36516566): utimes(nullptr) does not pick the "now" time in the
-// application's time domain, so when asserting that times are within a window,
-// we expand the window to allow for differences between the time domains.
-constexpr absl::Duration kClockSlack = absl::Milliseconds(100);
-
 // TimeBoxed runs fn, setting before and after to (coarse realtime) times
 // guaranteed* to come before and after fn started and completed, respectively.
 //
 // fn may be called more than once if the clock is adjusted.
-//
-// * See the comment on kClockSlack. gVisor breaks this guarantee.
 void TimeBoxed(absl::Time* before, absl::Time* after,
                std::function<void()> const& fn) {
   do {
@@ -69,12 +62,6 @@ void TimeBoxed(absl::Time* before, absl::Time* after,
       // which could lead to test failures, but that is very unlikely to happen.
       continue;
     }
-
-    if (IsRunningOnGvisor()) {
-      // See comment on kClockSlack.
-      *before -= kClockSlack;
-      *after += kClockSlack;
-    }
   } while (*after < *before);
 }
 
@@ -235,10 +222,7 @@ void TestUtimensat(int dirFd, std::string const& path) {
   EXPECT_GE(mtime3, before);
   EXPECT_LE(mtime3, after);
 
-  if (!IsRunningOnGvisor()) {
-    // FIXME(b/36516566): Gofers set atime and mtime to different "now" times.
-    EXPECT_EQ(atime3, mtime3);
-  }
+  EXPECT_EQ(atime3, mtime3);
 }
 
 TEST(UtimensatTest, OnAbsPath) {
-- 
cgit v1.2.3


From 6dd5a1f3fe55daa8510b1ee5e3a59219aad92af6 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Wed, 8 Apr 2020 17:56:55 -0700
Subject: Clean up TODOs

PiperOrigin-RevId: 305592245
---
 pkg/sentry/fs/tmpfs/fs.go              |  3 ---
 pkg/sentry/fsimpl/kernfs/filesystem.go |  2 +-
 pkg/sentry/kernel/ptrace.go            |  1 -
 pkg/sentry/vfs/filesystem.go           |  2 +-
 pkg/sentry/vfs/mount.go                | 12 ++++++------
 pkg/sentry/vfs/mount_test.go           |  2 +-
 runsc/cmd/gofer.go                     |  5 ++---
 test/syscalls/linux/epoll.cc           |  4 ----
 test/syscalls/linux/file_base.h        |  1 +
 test/syscalls/linux/pwrite64.cc        |  9 +--------
 test/syscalls/linux/tuntap.cc          |  7 ++++---
 test/syscalls/linux/write.cc           | 10 ++--------
 tools/go_generics/defs.bzl             |  1 -
 13 files changed, 19 insertions(+), 40 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go
index d5be56c3f..bc117ca6a 100644
--- a/pkg/sentry/fs/tmpfs/fs.go
+++ b/pkg/sentry/fs/tmpfs/fs.go
@@ -44,9 +44,6 @@ const (
 	// lookup.
 	cacheRevalidate = "revalidate"
 
-	// TODO(edahlgren/mpratt): support a tmpfs size limit.
-	// size = "size"
-
 	// Permissions that exceed modeMask will be rejected.
 	modeMask = 01777
 
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 16a3c18ae..4433071aa 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -682,7 +682,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 	if err != nil {
 		return linux.Statfs{}, err
 	}
-	// TODO: actually implement statfs
+	// TODO(gvisor.dev/issue/1193): actually implement statfs.
 	return linux.Statfs{}, syserror.ENOSYS
 }
 
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 35ad97d5d..e23e796ef 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -184,7 +184,6 @@ func (t *Task) CanTrace(target *Task, attach bool) bool {
 	if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 {
 		return false
 	}
-	// TODO: Yama LSM
 	return true
 }
 
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index cd34782ff..bef1bd312 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -497,7 +497,7 @@ type FilesystemImpl interface {
 	// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
 	PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
 
-	// TODO: inotify_add_watch()
+	// TODO(gvisor.dev/issue/1479): inotify_add_watch()
 }
 
 // PrependPathAtVFSRootError is returned by implementations of
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 1b8ecc415..f06946103 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -233,9 +233,9 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
 		}
 		vd.dentry.mu.Lock()
 	}
-	// TODO: Linux requires that either both the mount point and the mount root
-	// are directories, or neither are, and returns ENOTDIR if this is not the
-	// case.
+	// TODO(gvisor.dev/issue/1035): Linux requires that either both the mount
+	// point and the mount root are directories, or neither are, and returns
+	// ENOTDIR if this is not the case.
 	mntns := vd.mount.ns
 	mnt := newMount(vfs, fs, root, mntns, opts)
 	vfs.mounts.seq.BeginWrite()
@@ -274,9 +274,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
 		}
 	}
 
-	// TODO(jamieliu): Linux special-cases umount of the caller's root, which
-	// we don't implement yet (we'll just fail it since the caller holds a
-	// reference on it).
+	// TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's
+	// root, which we don't implement yet (we'll just fail it since the caller
+	// holds a reference on it).
 
 	vfs.mounts.seq.BeginWrite()
 	if opts.Flags&linux.MNT_DETACH == 0 {
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index 3b933468d..3335e4057 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -55,7 +55,7 @@ func TestMountTableInsertLookup(t *testing.T) {
 	}
 }
 
-// TODO: concurrent lookup/insertion/removal
+// TODO(gvisor.dev/issue/1035): concurrent lookup/insertion/removal.
 
 // must be powers of 2
 var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8}
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 02e5af3d3..28f0d54b9 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -272,9 +272,8 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 
 	root := spec.Root.Path
 	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
-		// FIXME: runsc can't be re-executed without
-		// /proc, so we create a tmpfs mount, mount ./proc and ./root
-		// there, then move this mount to the root and after
+		// runsc can't be re-executed without /proc, so we create a tmpfs mount,
+		// mount ./proc and ./root there, then move this mount to the root and after
 		// setCapsAndCallSelf, runsc will chroot into /root.
 		//
 		// We need a directory to construct a new root and we know that
diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc
index a4f8f3cec..f57d38dc7 100644
--- a/test/syscalls/linux/epoll.cc
+++ b/test/syscalls/linux/epoll.cc
@@ -56,10 +56,6 @@ TEST(EpollTest, AllWritable) {
   struct epoll_event result[kFDsPerEpoll];
   ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1),
               SyscallSucceedsWithValue(kFDsPerEpoll));
-  // TODO(edahlgren): Why do some tests check epoll_event::data, and others
-  // don't? Does Linux actually guarantee that, in any of these test cases,
-  // epoll_wait will necessarily write out the epoll_events in the order that
-  // they were registered?
   for (int i = 0; i < kFDsPerEpoll; i++) {
     ASSERT_EQ(result[i].events, EPOLLOUT);
   }
diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h
index 25fdd7106..fb418e052 100644
--- a/test/syscalls/linux/file_base.h
+++ b/test/syscalls/linux/file_base.h
@@ -87,6 +87,7 @@ class FileTest : public ::testing::Test {
     ClosePipes();
   }
 
+ protected:
   std::string test_file_name_;
   FileDescriptor test_file_fd_;
 
diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc
index b48fe540d..c2f72e010 100644
--- a/test/syscalls/linux/pwrite64.cc
+++ b/test/syscalls/linux/pwrite64.cc
@@ -27,14 +27,7 @@ namespace testing {
 
 namespace {
 
-// This test is currently very rudimentary.
-//
-// TODO(edahlgren):
-// * bad buffer states (EFAULT).
-// * bad fds (wrong permission, wrong type of file, EBADF).
-// * check offset is not incremented.
-// * check for EOF.
-// * writing to pipes, symlinks, special files.
+// TODO(gvisor.dev/issue/2370): This test is currently very rudimentary.
 class Pwrite64 : public ::testing::Test {
   void SetUp() override {
     name_ = NewTempAbsPath();
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
index 53ad2dda3..3a8ba37eb 100644
--- a/test/syscalls/linux/tuntap.cc
+++ b/test/syscalls/linux/tuntap.cc
@@ -242,7 +242,7 @@ TEST_F(TuntapTest, InvalidReadWrite) {
 TEST_F(TuntapTest, WriteToDownDevice) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
 
-  // FIXME: gVisor always creates enabled/up'd interfaces.
+  // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces.
   SKIP_IF(IsRunningOnGvisor());
 
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
@@ -280,10 +280,11 @@ PosixErrorOr<FileDescriptor> OpenAndAttachTap(
                                    &addr, sizeof(addr)));
 
   if (!IsRunningOnGvisor()) {
-    // FIXME: gVisor doesn't support setting MAC address on interfaces yet.
+    // FIXME(b/110961832): gVisor doesn't support setting MAC address on
+    // interfaces yet.
     RETURN_IF_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA)));
 
-    // FIXME: gVisor always creates enabled/up'd interfaces.
+    // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces.
     RETURN_IF_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP));
   }
 
diff --git a/test/syscalls/linux/write.cc b/test/syscalls/linux/write.cc
index 9b219cfd6..39b5b2f56 100644
--- a/test/syscalls/linux/write.cc
+++ b/test/syscalls/linux/write.cc
@@ -31,14 +31,8 @@ namespace gvisor {
 namespace testing {
 
 namespace {
-// This test is currently very rudimentary.
-//
-// TODO(edahlgren):
-// * bad buffer states (EFAULT).
-// * bad fds (wrong permission, wrong type of file, EBADF).
-// * check offset is incremented.
-// * check for EOF.
-// * writing to pipes, symlinks, special files.
+
+// TODO(gvisor.dev/issue/2370): This test is currently very rudimentary.
 class WriteTest : public ::testing::Test {
  public:
   ssize_t WriteBytes(int fd, int bytes) {
diff --git a/tools/go_generics/defs.bzl b/tools/go_generics/defs.bzl
index c5be52ecd..8c9995fd4 100644
--- a/tools/go_generics/defs.bzl
+++ b/tools/go_generics/defs.bzl
@@ -105,7 +105,6 @@ def _go_template_instance_impl(ctx):
         executable = ctx.executable._tool,
     )
 
-    # TODO: How can we get the dependencies out?
     return struct(
         files = depset([output]),
     )
-- 
cgit v1.2.3


From 7297fd7238e17803e073fb5a5ef85edf992bdf6b Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 8 Apr 2020 19:40:15 -0700
Subject: Bump proc_test's kRSSTolerance to 10MB.

PiperOrigin-RevId: 305604557
---
 test/syscalls/linux/proc.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index da98e1f66..79a625ebc 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -994,7 +994,7 @@ constexpr uint64_t kMappingSize = 100 << 20;
 
 // Tolerance on RSS comparisons to account for background thread mappings,
 // reclaimed pages, newly faulted pages, etc.
-constexpr uint64_t kRSSTolerance = 5 << 20;
+constexpr uint64_t kRSSTolerance = 10 << 20;
 
 // Capture RSS before and after an anonymous mapping with passed prot.
 void MapPopulateRSS(int prot, uint64_t* before, uint64_t* after) {
-- 
cgit v1.2.3


From a10389e783aab5f530641394ef44c8a1dede9372 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 8 Apr 2020 23:02:09 -0700
Subject: splice: cap splice calls to MAX_RW_COUNT

Linux does the same.

Reported-by: syzbot+e81716e8956e92e9d56b@syzkaller.appspotmail.com
PiperOrigin-RevId: 305625439
---
 pkg/sentry/syscalls/linux/sys_splice.go |   4 ++
 test/syscalls/linux/BUILD               |   2 +
 test/syscalls/linux/sendfile_socket.cc  | 105 ++++++++++++++------------------
 3 files changed, 53 insertions(+), 58 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index fd642834b..fbc6cf15f 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -29,6 +29,10 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
 		return 0, syserror.EINVAL
 	}
 
+	if opts.Length > int64(kernel.MAX_RW_COUNT) {
+		opts.Length = int64(kernel.MAX_RW_COUNT)
+	}
+
 	var (
 		total int64
 		n     int64
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index d0c431234..ae3017608 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2026,6 +2026,8 @@ cc_binary(
         "//test/util:file_descriptor",
         "@com_google_absl//absl/strings",
         gtest,
+        ":ip_socket_test_util",
+        ":unix_domain_socket_test_util",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc
index e94672679..c101fe9d2 100644
--- a/test/syscalls/linux/sendfile_socket.cc
+++ b/test/syscalls/linux/sendfile_socket.cc
@@ -23,6 +23,7 @@
 
 #include "gtest/gtest.h"
 #include "absl/strings/string_view.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/temp_path.h"
@@ -35,61 +36,39 @@ namespace {
 
 class SendFileTest : public ::testing::TestWithParam<int> {
  protected:
-  PosixErrorOr<std::tuple<int, int>> Sockets() {
+  PosixErrorOr<std::unique_ptr<SocketPair>> Sockets(int type) {
     // Bind a server socket.
     int family = GetParam();
-    struct sockaddr server_addr = {};
     switch (family) {
       case AF_INET: {
-        struct sockaddr_in* server_addr_in =
-            reinterpret_cast<struct sockaddr_in*>(&server_addr);
-        server_addr_in->sin_family = family;
-        server_addr_in->sin_addr.s_addr = INADDR_ANY;
-        break;
+        if (type == SOCK_STREAM) {
+          return SocketPairKind{
+              "TCP", AF_INET, type, 0,
+              TCPAcceptBindSocketPairCreator(AF_INET, type, 0, false)}
+              .Create();
+        } else {
+          return SocketPairKind{
+              "UDP", AF_INET, type, 0,
+              UDPBidirectionalBindSocketPairCreator(AF_INET, type, 0, false)}
+              .Create();
+        }
       }
       case AF_UNIX: {
-        struct sockaddr_un* server_addr_un =
-            reinterpret_cast<struct sockaddr_un*>(&server_addr);
-        server_addr_un->sun_family = family;
-        server_addr_un->sun_path[0] = '\0';
-        break;
+        if (type == SOCK_STREAM) {
+          return SocketPairKind{
+              "UNIX", AF_UNIX, type, 0,
+              FilesystemAcceptBindSocketPairCreator(AF_UNIX, type, 0)}
+              .Create();
+        } else {
+          return SocketPairKind{
+              "UNIX", AF_UNIX, type, 0,
+              FilesystemBidirectionalBindSocketPairCreator(AF_UNIX, type, 0)}
+              .Create();
+        }
       }
       default:
         return PosixError(EINVAL);
     }
-    int server = socket(family, SOCK_STREAM, 0);
-    if (bind(server, &server_addr, sizeof(server_addr)) < 0) {
-      return PosixError(errno);
-    }
-    if (listen(server, 1) < 0) {
-      close(server);
-      return PosixError(errno);
-    }
-
-    // Fetch the address; both are anonymous.
-    socklen_t length = sizeof(server_addr);
-    if (getsockname(server, &server_addr, &length) < 0) {
-      close(server);
-      return PosixError(errno);
-    }
-
-    // Connect the client.
-    int client = socket(family, SOCK_STREAM, 0);
-    if (connect(client, &server_addr, length) < 0) {
-      close(server);
-      close(client);
-      return PosixError(errno);
-    }
-
-    // Accept on the server.
-    int server_client = accept(server, nullptr, 0);
-    if (server_client < 0) {
-      close(server);
-      close(client);
-      return PosixError(errno);
-    }
-    close(server);
-    return std::make_tuple(client, server_client);
   }
 };
 
@@ -106,9 +85,7 @@ TEST_P(SendFileTest, SendMultiple) {
   const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
 
   // Create sockets.
-  std::tuple<int, int> fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets());
-  const FileDescriptor server(std::get<0>(fds));
-  FileDescriptor client(std::get<1>(fds));  // non-const, reset is used.
+  auto socks = ASSERT_NO_ERRNO_AND_VALUE(Sockets(SOCK_STREAM));
 
   // Thread that reads data from socket and dumps to a file.
   ScopedThread th([&] {
@@ -118,7 +95,7 @@ TEST_P(SendFileTest, SendMultiple) {
     // Read until socket is closed.
     char buf[10240];
     for (int cnt = 0;; cnt++) {
-      int r = RetryEINTR(read)(server.get(), buf, sizeof(buf));
+      int r = RetryEINTR(read)(socks->first_fd(), buf, sizeof(buf));
       // We cannot afford to save on every read() call.
       if (cnt % 1000 == 0) {
         ASSERT_THAT(r, SyscallSucceeds());
@@ -152,7 +129,7 @@ TEST_P(SendFileTest, SendMultiple) {
               << ", remain=" << remain << std::endl;
 
     // Send data and verify that sendfile returns the correct value.
-    int res = sendfile(client.get(), inf.get(), nullptr, remain);
+    int res = sendfile(socks->second_fd(), inf.get(), nullptr, remain);
     // We cannot afford to save on every sendfile() call.
     if (cnt % 120 == 0) {
       MaybeSave();
@@ -169,7 +146,7 @@ TEST_P(SendFileTest, SendMultiple) {
   }
 
   // Close socket to stop thread.
-  client.reset();
+  close(socks->release_second_fd());
   th.Join();
 
   // Verify that the output file has the correct data.
@@ -183,9 +160,7 @@ TEST_P(SendFileTest, SendMultiple) {
 
 TEST_P(SendFileTest, Shutdown) {
   // Create a socket.
-  std::tuple<int, int> fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets());
-  const FileDescriptor client(std::get<0>(fds));
-  FileDescriptor server(std::get<1>(fds));  // non-const, reset below.
+  auto socks = ASSERT_NO_ERRNO_AND_VALUE(Sockets(SOCK_STREAM));
 
   // If this is a TCP socket, then turn off linger.
   if (GetParam() == AF_INET) {
@@ -193,7 +168,7 @@ TEST_P(SendFileTest, Shutdown) {
     sl.l_onoff = 1;
     sl.l_linger = 0;
     ASSERT_THAT(
-        setsockopt(server.get(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+        setsockopt(socks->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
         SyscallSucceeds());
   }
 
@@ -212,12 +187,12 @@ TEST_P(SendFileTest, Shutdown) {
   ScopedThread t([&]() {
     size_t done = 0;
     while (done < data.size()) {
-      int n = RetryEINTR(read)(server.get(), data.data(), data.size());
+      int n = RetryEINTR(read)(socks->first_fd(), data.data(), data.size());
       ASSERT_THAT(n, SyscallSucceeds());
       done += n;
     }
     // Close the server side socket.
-    server.reset();
+    close(socks->release_first_fd());
   });
 
   // Continuously stream from the file to the socket. Note we do not assert
@@ -225,7 +200,7 @@ TEST_P(SendFileTest, Shutdown) {
   // data is written. Eventually, we should get a connection reset error.
   while (1) {
     off_t offset = 0;  // Always read from the start.
-    int n = sendfile(client.get(), inf.get(), &offset, data.size());
+    int n = sendfile(socks->second_fd(), inf.get(), &offset, data.size());
     EXPECT_THAT(n, AnyOf(SyscallFailsWithErrno(ECONNRESET),
                          SyscallFailsWithErrno(EPIPE), SyscallSucceeds()));
     if (n <= 0) {
@@ -234,6 +209,20 @@ TEST_P(SendFileTest, Shutdown) {
   }
 }
 
+TEST_P(SendFileTest, SendpageFromEmptyFileToUDP) {
+  auto socks = ASSERT_NO_ERRNO_AND_VALUE(Sockets(SOCK_DGRAM));
+
+  TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+  // The count argument must be large enough that it is impossible to allocate
+  // a buffer of this size. In Linux, sendfile transfers at most 0x7ffff000
+  // (MAX_RW_COUNT) bytes.
+  EXPECT_THAT(sendfile(socks->first_fd(), fd.get(), 0x0, 0x8000000000004),
+              SyscallSucceedsWithValue(0));
+}
+
 INSTANTIATE_TEST_SUITE_P(AddressFamily, SendFileTest,
                          ::testing::Values(AF_UNIX, AF_INET));
 
-- 
cgit v1.2.3


From 64c2b490671852aaa024a4bb4757eef309fadf18 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Thu, 9 Apr 2020 13:33:18 -0700
Subject: Dedup netlink utility functions in tests.

PiperOrigin-RevId: 305749697
---
 test/syscalls/linux/BUILD                        |  3 +-
 test/syscalls/linux/socket_netlink_route.cc      | 75 ++++--------------------
 test/syscalls/linux/socket_netlink_route_util.cc |  7 +--
 test/syscalls/linux/socket_netlink_route_util.h  |  4 +-
 test/syscalls/linux/tuntap.cc                    | 19 +++---
 5 files changed, 24 insertions(+), 84 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index ae3017608..96ca39583 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -138,7 +138,6 @@ cc_library(
     hdrs = ["socket_netlink_route_util.h"],
     deps = [
         ":socket_netlink_util",
-        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -2804,13 +2803,13 @@ cc_binary(
     srcs = ["socket_netlink_route.cc"],
     linkstatic = 1,
     deps = [
+        ":socket_netlink_route_util",
         ":socket_netlink_util",
         ":socket_test_util",
         "//test/util:capability_util",
         "//test/util:cleanup",
         "//test/util:file_descriptor",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/types:optional",
         gtest,
         "//test/util:test_main",
         "//test/util:test_util",
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
index 2efb96bc3..fbe61c5a0 100644
--- a/test/syscalls/linux/socket_netlink_route.cc
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -26,7 +26,7 @@
 
 #include "gtest/gtest.h"
 #include "absl/strings/str_format.h"
-#include "absl/types/optional.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
 #include "test/syscalls/linux/socket_netlink_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/capability_util.h"
@@ -118,24 +118,6 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
   // TODO(mpratt): Check ifinfomsg contents and following attrs.
 }
 
-PosixError DumpLinks(
-    const FileDescriptor& fd, uint32_t seq,
-    const std::function<void(const struct nlmsghdr* hdr)>& fn) {
-  struct request {
-    struct nlmsghdr hdr;
-    struct ifinfomsg ifm;
-  };
-
-  struct request req = {};
-  req.hdr.nlmsg_len = sizeof(req);
-  req.hdr.nlmsg_type = RTM_GETLINK;
-  req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
-  req.hdr.nlmsg_seq = seq;
-  req.ifm.ifi_family = AF_UNSPEC;
-
-  return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false);
-}
-
 TEST(NetlinkRouteTest, GetLinkDump) {
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
@@ -161,37 +143,6 @@ TEST(NetlinkRouteTest, GetLinkDump) {
   EXPECT_TRUE(loopbackFound);
 }
 
-struct Link {
-  int index;
-  std::string name;
-};
-
-PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
-  ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
-
-  absl::optional<Link> link;
-  RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
-    if (hdr->nlmsg_type != RTM_NEWLINK ||
-        hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
-      return;
-    }
-    const struct ifinfomsg* msg =
-        reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
-    if (msg->ifi_type == ARPHRD_LOOPBACK) {
-      const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
-      if (rta == nullptr) {
-        // Ignore links that do not have a name.
-        return;
-      }
-
-      link = Link();
-      link->index = msg->ifi_index;
-      link->name = std::string(reinterpret_cast<const char*>(RTA_DATA(rta)));
-    }
-  }));
-  return link;
-}
-
 // CheckLinkMsg checks a netlink message against an expected link.
 void CheckLinkMsg(const struct nlmsghdr* hdr, const Link& link) {
   ASSERT_THAT(hdr->nlmsg_type, Eq(RTM_NEWLINK));
@@ -209,9 +160,7 @@ void CheckLinkMsg(const struct nlmsghdr* hdr, const Link& link) {
 }
 
 TEST(NetlinkRouteTest, GetLinkByIndex) {
-  absl::optional<Link> loopback_link =
-      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
-  ASSERT_TRUE(loopback_link.has_value());
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
 
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
@@ -227,13 +176,13 @@ TEST(NetlinkRouteTest, GetLinkByIndex) {
   req.hdr.nlmsg_flags = NLM_F_REQUEST;
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
-  req.ifm.ifi_index = loopback_link->index;
+  req.ifm.ifi_index = loopback_link.index;
 
   bool found = false;
   ASSERT_NO_ERRNO(NetlinkRequestResponse(
       fd, &req, sizeof(req),
       [&](const struct nlmsghdr* hdr) {
-        CheckLinkMsg(hdr, *loopback_link);
+        CheckLinkMsg(hdr, loopback_link);
         found = true;
       },
       false));
@@ -241,9 +190,7 @@ TEST(NetlinkRouteTest, GetLinkByIndex) {
 }
 
 TEST(NetlinkRouteTest, GetLinkByName) {
-  absl::optional<Link> loopback_link =
-      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
-  ASSERT_TRUE(loopback_link.has_value());
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
 
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
@@ -262,8 +209,8 @@ TEST(NetlinkRouteTest, GetLinkByName) {
   req.hdr.nlmsg_seq = kSeq;
   req.ifm.ifi_family = AF_UNSPEC;
   req.rtattr.rta_type = IFLA_IFNAME;
-  req.rtattr.rta_len = RTA_LENGTH(loopback_link->name.size() + 1);
-  strncpy(req.ifname, loopback_link->name.c_str(), sizeof(req.ifname));
+  req.rtattr.rta_len = RTA_LENGTH(loopback_link.name.size() + 1);
+  strncpy(req.ifname, loopback_link.name.c_str(), sizeof(req.ifname));
   req.hdr.nlmsg_len =
       NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
 
@@ -271,7 +218,7 @@ TEST(NetlinkRouteTest, GetLinkByName) {
   ASSERT_NO_ERRNO(NetlinkRequestResponse(
       fd, &req, sizeof(req),
       [&](const struct nlmsghdr* hdr) {
-        CheckLinkMsg(hdr, *loopback_link);
+        CheckLinkMsg(hdr, loopback_link);
         found = true;
       },
       false));
@@ -523,9 +470,7 @@ TEST(NetlinkRouteTest, LookupAll) {
 TEST(NetlinkRouteTest, AddAddr) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
 
-  absl::optional<Link> loopback_link =
-      ASSERT_NO_ERRNO_AND_VALUE(FindLoopbackLink());
-  ASSERT_TRUE(loopback_link.has_value());
+  Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
 
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
@@ -545,7 +490,7 @@ TEST(NetlinkRouteTest, AddAddr) {
   req.ifa.ifa_prefixlen = 24;
   req.ifa.ifa_flags = 0;
   req.ifa.ifa_scope = 0;
-  req.ifa.ifa_index = loopback_link->index;
+  req.ifa.ifa_index = loopback_link.index;
   req.rtattr.rta_type = IFA_LOCAL;
   req.rtattr.rta_len = RTA_LENGTH(sizeof(req.addr));
   inet_pton(AF_INET, "10.0.0.1", &req.addr);
diff --git a/test/syscalls/linux/socket_netlink_route_util.cc b/test/syscalls/linux/socket_netlink_route_util.cc
index 53eb3b6b2..bde1dbb4d 100644
--- a/test/syscalls/linux/socket_netlink_route_util.cc
+++ b/test/syscalls/linux/socket_netlink_route_util.cc
@@ -18,7 +18,6 @@
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 
-#include "absl/types/optional.h"
 #include "test/syscalls/linux/socket_netlink_util.h"
 
 namespace gvisor {
@@ -73,14 +72,14 @@ PosixErrorOr<std::vector<Link>> DumpLinks() {
   return links;
 }
 
-PosixErrorOr<absl::optional<Link>> FindLoopbackLink() {
+PosixErrorOr<Link> LoopbackLink() {
   ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
   for (const auto& link : links) {
     if (link.type == ARPHRD_LOOPBACK) {
-      return absl::optional<Link>(link);
+      return link;
     }
   }
-  return absl::optional<Link>();
+  return PosixError(ENOENT, "loopback link not found");
 }
 
 PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
diff --git a/test/syscalls/linux/socket_netlink_route_util.h b/test/syscalls/linux/socket_netlink_route_util.h
index 2c018e487..149c4a7f6 100644
--- a/test/syscalls/linux/socket_netlink_route_util.h
+++ b/test/syscalls/linux/socket_netlink_route_util.h
@@ -20,7 +20,6 @@
 
 #include <vector>
 
-#include "absl/types/optional.h"
 #include "test/syscalls/linux/socket_netlink_util.h"
 
 namespace gvisor {
@@ -37,7 +36,8 @@ PosixError DumpLinks(const FileDescriptor& fd, uint32_t seq,
 
 PosixErrorOr<std::vector<Link>> DumpLinks();
 
-PosixErrorOr<absl::optional<Link>> FindLoopbackLink();
+// Returns the loopback link on the system. ENOENT if not found.
+PosixErrorOr<Link> LoopbackLink();
 
 // LinkAddLocalAddr sets IFA_LOCAL attribute on the interface.
 PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
index 3a8ba37eb..6195b11e1 100644
--- a/test/syscalls/linux/tuntap.cc
+++ b/test/syscalls/linux/tuntap.cc
@@ -56,14 +56,14 @@ PosixErrorOr<std::set<std::string>> DumpLinkNames() {
   return names;
 }
 
-PosixErrorOr<absl::optional<Link>> GetLinkByName(const std::string& name) {
+PosixErrorOr<Link> GetLinkByName(const std::string& name) {
   ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
   for (const auto& link : links) {
     if (link.name == name) {
-      return absl::optional<Link>(link);
+      return link;
     }
   }
-  return absl::optional<Link>();
+  return PosixError(ENOENT, "interface not found");
 }
 
 struct pihdr {
@@ -268,24 +268,21 @@ PosixErrorOr<FileDescriptor> OpenAndAttachTap(
     return PosixError(errno);
   }
 
-  ASSIGN_OR_RETURN_ERRNO(absl::optional<Link> link, GetLinkByName(dev_name));
-  if (!link.has_value()) {
-    return PosixError(ENOENT, "no link");
-  }
+  ASSIGN_OR_RETURN_ERRNO(auto link, GetLinkByName(dev_name));
 
   // Interface setup.
   struct in_addr addr;
   inet_pton(AF_INET, dev_ipv4_addr.c_str(), &addr);
-  EXPECT_NO_ERRNO(LinkAddLocalAddr(link->index, AF_INET, /*prefixlen=*/24,
-                                   &addr, sizeof(addr)));
+  EXPECT_NO_ERRNO(LinkAddLocalAddr(link.index, AF_INET, /*prefixlen=*/24, &addr,
+                                   sizeof(addr)));
 
   if (!IsRunningOnGvisor()) {
     // FIXME(b/110961832): gVisor doesn't support setting MAC address on
     // interfaces yet.
-    RETURN_IF_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA)));
+    RETURN_IF_ERRNO(LinkSetMacAddr(link.index, kMacA, sizeof(kMacA)));
 
     // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces.
-    RETURN_IF_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP));
+    RETURN_IF_ERRNO(LinkChangeFlags(link.index, IFF_UP, IFF_UP));
   }
 
   return fd;
-- 
cgit v1.2.3


From ace90f823cf33d1c1180dcd0d2061c702270a0d6 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Thu, 9 Apr 2020 16:21:02 -0700
Subject: Make some functions in IfAddrHelper const.

PiperOrigin-RevId: 305782490
---
 test/syscalls/linux/ip_socket_test_util.cc         | 10 ++---
 test/syscalls/linux/ip_socket_test_util.h          |  6 +--
 .../socket_ipv4_udp_unbound_external_networking.cc | 49 ++++++++++------------
 .../socket_ipv4_udp_unbound_external_networking.h  |  6 +--
 4 files changed, 32 insertions(+), 39 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index bba022a41..d28dc0db6 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -177,17 +177,17 @@ SocketKind IPv6TCPUnboundSocket(int type) {
 PosixError IfAddrHelper::Load() {
   Release();
   RETURN_ERROR_IF_SYSCALL_FAIL(getifaddrs(&ifaddr_));
-  return PosixError(0);
+  return NoError();
 }
 
 void IfAddrHelper::Release() {
   if (ifaddr_) {
     freeifaddrs(ifaddr_);
+    ifaddr_ = nullptr;
   }
-  ifaddr_ = nullptr;
 }
 
-std::vector<std::string> IfAddrHelper::InterfaceList(int family) {
+std::vector<std::string> IfAddrHelper::InterfaceList(int family) const {
   std::vector<std::string> names;
   for (auto ifa = ifaddr_; ifa != NULL; ifa = ifa->ifa_next) {
     if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != family) {
@@ -198,7 +198,7 @@ std::vector<std::string> IfAddrHelper::InterfaceList(int family) {
   return names;
 }
 
-sockaddr* IfAddrHelper::GetAddr(int family, std::string name) {
+const sockaddr* IfAddrHelper::GetAddr(int family, std::string name) const {
   for (auto ifa = ifaddr_; ifa != NULL; ifa = ifa->ifa_next) {
     if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != family) {
       continue;
@@ -210,7 +210,7 @@ sockaddr* IfAddrHelper::GetAddr(int family, std::string name) {
   return nullptr;
 }
 
-PosixErrorOr<int> IfAddrHelper::GetIndex(std::string name) {
+PosixErrorOr<int> IfAddrHelper::GetIndex(std::string name) const {
   return InterfaceIndex(name);
 }
 
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 39fd6709d..9c3859fcd 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -110,10 +110,10 @@ class IfAddrHelper {
   PosixError Load();
   void Release();
 
-  std::vector<std::string> InterfaceList(int family);
+  std::vector<std::string> InterfaceList(int family) const;
 
-  struct sockaddr* GetAddr(int family, std::string name);
-  PosixErrorOr<int> GetIndex(std::string name);
+  const sockaddr* GetAddr(int family, std::string name) const;
+  PosixErrorOr<int> GetIndex(std::string name) const;
 
  private:
   struct ifaddrs* ifaddr_;
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
index 40e673625..d690d9564 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
@@ -45,37 +45,31 @@ void IPv4UDPUnboundExternalNetworkingSocketTest::SetUp() {
   got_if_infos_ = false;
 
   // Get interface list.
-  std::vector<std::string> if_names;
   ASSERT_NO_ERRNO(if_helper_.Load());
-  if_names = if_helper_.InterfaceList(AF_INET);
+  std::vector<std::string> if_names = if_helper_.InterfaceList(AF_INET);
   if (if_names.size() != 2) {
     return;
   }
 
   // Figure out which interface is where.
-  int lo = 0, eth = 1;
-  if (if_names[lo] != "lo") {
-    lo = 1;
-    eth = 0;
-  }
-
-  if (if_names[lo] != "lo") {
-    return;
-  }
-
-  lo_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(if_names[lo]));
-  lo_if_addr_ = if_helper_.GetAddr(AF_INET, if_names[lo]);
-  if (lo_if_addr_ == nullptr) {
+  std::string lo = if_names[0];
+  std::string eth = if_names[1];
+  if (lo != "lo") std::swap(lo, eth);
+  if (lo != "lo") return;
+
+  lo_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(lo));
+  auto lo_if_addr = if_helper_.GetAddr(AF_INET, lo);
+  if (lo_if_addr == nullptr) {
     return;
   }
-  lo_if_sin_addr_ = reinterpret_cast<sockaddr_in*>(lo_if_addr_)->sin_addr;
+  lo_if_addr_ = *reinterpret_cast<const sockaddr_in*>(lo_if_addr);
 
-  eth_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(if_names[eth]));
-  eth_if_addr_ = if_helper_.GetAddr(AF_INET, if_names[eth]);
-  if (eth_if_addr_ == nullptr) {
+  eth_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(eth));
+  auto eth_if_addr = if_helper_.GetAddr(AF_INET, eth);
+  if (eth_if_addr == nullptr) {
     return;
   }
-  eth_if_sin_addr_ = reinterpret_cast<sockaddr_in*>(eth_if_addr_)->sin_addr;
+  eth_if_addr_ = *reinterpret_cast<const sockaddr_in*>(eth_if_addr);
 
   got_if_infos_ = true;
 }
@@ -242,7 +236,7 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
   // Bind the non-receiving socket to the unicast ethernet address.
   auto norecv_addr = rcv1_addr;
   reinterpret_cast<sockaddr_in*>(&norecv_addr.addr)->sin_addr =
-      eth_if_sin_addr_;
+      eth_if_addr_.sin_addr;
   ASSERT_THAT(bind(norcv->get(), reinterpret_cast<sockaddr*>(&norecv_addr.addr),
                    norecv_addr.addr_len),
               SyscallSucceedsWithValue(0));
@@ -1028,7 +1022,7 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
   auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   ip_mreqn iface = {};
   iface.imr_ifindex = lo_if_idx_;
-  iface.imr_address = eth_if_sin_addr_;
+  iface.imr_address = eth_if_addr_.sin_addr;
   ASSERT_THAT(setsockopt(sender->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
                          sizeof(iface)),
               SyscallSucceeds());
@@ -1058,7 +1052,7 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
   SKIP_IF(IsRunningOnGvisor());
 
   // Verify the received source address.
-  EXPECT_EQ(eth_if_sin_addr_.s_addr, src_addr_in->sin_addr.s_addr);
+  EXPECT_EQ(eth_if_addr_.sin_addr.s_addr, src_addr_in->sin_addr.s_addr);
 }
 
 // Check that when we are bound to one interface we can set IP_MULTICAST_IF to
@@ -1075,7 +1069,8 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
 
   // Create sender and bind to eth interface.
   auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
-  ASSERT_THAT(bind(sender->get(), eth_if_addr_, sizeof(sockaddr_in)),
+  ASSERT_THAT(bind(sender->get(), reinterpret_cast<sockaddr*>(&eth_if_addr_),
+                   sizeof(eth_if_addr_)),
               SyscallSucceeds());
 
   // Run through all possible combinations of index and address for
@@ -1085,9 +1080,9 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
     struct in_addr imr_address;
   } test_data[] = {
       {lo_if_idx_, {}},
-      {0, lo_if_sin_addr_},
-      {lo_if_idx_, lo_if_sin_addr_},
-      {lo_if_idx_, eth_if_sin_addr_},
+      {0, lo_if_addr_.sin_addr},
+      {lo_if_idx_, lo_if_addr_.sin_addr},
+      {lo_if_idx_, eth_if_addr_.sin_addr},
   };
   for (auto t : test_data) {
     ip_mreqn iface = {};
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h
index bec2e96ee..10b90b1e0 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h
@@ -36,10 +36,8 @@ class IPv4UDPUnboundExternalNetworkingSocketTest : public SimpleSocketTest {
   // Interface infos.
   int lo_if_idx_;
   int eth_if_idx_;
-  sockaddr* lo_if_addr_;
-  sockaddr* eth_if_addr_;
-  in_addr lo_if_sin_addr_;
-  in_addr eth_if_sin_addr_;
+  sockaddr_in lo_if_addr_;
+  sockaddr_in eth_if_addr_;
 };
 
 }  // namespace testing
-- 
cgit v1.2.3


From 9f87502b4619b60779ce19c41ea0e6bd6582e8e4 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Thu, 9 Apr 2020 16:40:12 -0700
Subject: Remove TODOs from Async IO

Block and drain requests in io_destroy(2).
Note the reason to create read-only mapping.

PiperOrigin-RevId: 305786312
---
 pkg/sentry/mm/aio_context.go         | 101 ++++++++++++++++++++++++-----------
 pkg/sentry/mm/aio_context_state.go   |   2 +-
 pkg/sentry/syscalls/linux/sys_aio.go |  34 +++++++++---
 test/syscalls/linux/aio.cc           |  12 +++--
 4 files changed, 107 insertions(+), 42 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index cb29d94b0..379148903 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -59,25 +59,27 @@ func (a *aioManager) newAIOContext(events uint32, id uint64) bool {
 	}
 
 	a.contexts[id] = &AIOContext{
-		done:           make(chan struct{}, 1),
+		requestReady:   make(chan struct{}, 1),
 		maxOutstanding: events,
 	}
 	return true
 }
 
-// destroyAIOContext destroys an asynchronous I/O context.
+// destroyAIOContext destroys an asynchronous I/O context. It doesn't wait for
+// pending requests to complete. Returns the destroyed AIOContext so it can
+// be drained.
 //
-// False is returned if the context does not exist.
-func (a *aioManager) destroyAIOContext(id uint64) bool {
+// Nil is returned if the context does not exist.
+func (a *aioManager) destroyAIOContext(id uint64) *AIOContext {
 	a.mu.Lock()
 	defer a.mu.Unlock()
 	ctx, ok := a.contexts[id]
 	if !ok {
-		return false
+		return nil
 	}
 	delete(a.contexts, id)
 	ctx.destroy()
-	return true
+	return ctx
 }
 
 // lookupAIOContext looks up the given context.
@@ -102,8 +104,8 @@ type ioResult struct {
 //
 // +stateify savable
 type AIOContext struct {
-	// done is the notification channel used for all requests.
-	done chan struct{} `state:"nosave"`
+	// requestReady is the notification channel used for all requests.
+	requestReady chan struct{} `state:"nosave"`
 
 	// mu protects below.
 	mu sync.Mutex `state:"nosave"`
@@ -129,8 +131,14 @@ func (ctx *AIOContext) destroy() {
 	ctx.mu.Lock()
 	defer ctx.mu.Unlock()
 	ctx.dead = true
-	if ctx.outstanding == 0 {
-		close(ctx.done)
+	ctx.checkForDone()
+}
+
+// Preconditions: ctx.mu must be held by caller.
+func (ctx *AIOContext) checkForDone() {
+	if ctx.dead && ctx.outstanding == 0 {
+		close(ctx.requestReady)
+		ctx.requestReady = nil
 	}
 }
 
@@ -154,11 +162,12 @@ func (ctx *AIOContext) PopRequest() (interface{}, bool) {
 
 	// Is there anything ready?
 	if e := ctx.results.Front(); e != nil {
-		ctx.results.Remove(e)
-		ctx.outstanding--
-		if ctx.outstanding == 0 && ctx.dead {
-			close(ctx.done)
+		if ctx.outstanding == 0 {
+			panic("AIOContext outstanding is going negative")
 		}
+		ctx.outstanding--
+		ctx.results.Remove(e)
+		ctx.checkForDone()
 		return e.data, true
 	}
 	return nil, false
@@ -172,26 +181,58 @@ func (ctx *AIOContext) FinishRequest(data interface{}) {
 
 	// Push to the list and notify opportunistically. The channel notify
 	// here is guaranteed to be safe because outstanding must be non-zero.
-	// The done channel is only closed when outstanding reaches zero.
+	// The requestReady channel is only closed when outstanding reaches zero.
 	ctx.results.PushBack(&ioResult{data: data})
 
 	select {
-	case ctx.done <- struct{}{}:
+	case ctx.requestReady <- struct{}{}:
 	default:
 	}
 }
 
 // WaitChannel returns a channel that is notified when an AIO request is
-// completed.
-//
-// The boolean return value indicates whether or not the context is active.
-func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) {
+// completed. Returns nil if the context is destroyed and there are no more
+// outstanding requests.
+func (ctx *AIOContext) WaitChannel() chan struct{} {
 	ctx.mu.Lock()
 	defer ctx.mu.Unlock()
-	if ctx.outstanding == 0 && ctx.dead {
-		return nil, false
+	return ctx.requestReady
+}
+
+// Dead returns true if the context has been destroyed.
+func (ctx *AIOContext) Dead() bool {
+	ctx.mu.Lock()
+	defer ctx.mu.Unlock()
+	return ctx.dead
+}
+
+// CancelPendingRequest forgets about a request that hasn't yet completed.
+func (ctx *AIOContext) CancelPendingRequest() {
+	ctx.mu.Lock()
+	defer ctx.mu.Unlock()
+
+	if ctx.outstanding == 0 {
+		panic("AIOContext outstanding is going negative")
 	}
-	return ctx.done, true
+	ctx.outstanding--
+	ctx.checkForDone()
+}
+
+// Drain drops all completed requests. Pending requests remain untouched.
+func (ctx *AIOContext) Drain() {
+	ctx.mu.Lock()
+	defer ctx.mu.Unlock()
+
+	if ctx.outstanding == 0 {
+		return
+	}
+	size := uint32(ctx.results.Len())
+	if ctx.outstanding < size {
+		panic("AIOContext outstanding is going negative")
+	}
+	ctx.outstanding -= size
+	ctx.results.Reset()
+	ctx.checkForDone()
 }
 
 // aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO
@@ -332,9 +373,9 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint
 		Length:          aioRingBufferSize,
 		MappingIdentity: m,
 		Mappable:        m,
-		// TODO(fvoznika): Linux does "do_mmap_pgoff(..., PROT_READ |
-		// PROT_WRITE, ...)" in fs/aio.c:aio_setup_ring(); why do we make this
-		// mapping read-only?
+		// Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in
+		// fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC,
+		// user mode should not write to this page.
 		Perms:    usermem.Read,
 		MaxPerms: usermem.Read,
 	})
@@ -349,11 +390,11 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint
 	return id, nil
 }
 
-// DestroyAIOContext destroys an asynchronous I/O context. It returns false if
-// the context does not exist.
-func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) bool {
+// DestroyAIOContext destroys an asynchronous I/O context. It returns the
+// destroyed context, or nil if the context does not exist.
+func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext {
 	if _, ok := mm.LookupAIOContext(ctx, id); !ok {
-		return false
+		return nil
 	}
 
 	// Only unmaps after it assured that the address is a valid aio context to
diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go
index c37fc9f7b..3dabac1af 100644
--- a/pkg/sentry/mm/aio_context_state.go
+++ b/pkg/sentry/mm/aio_context_state.go
@@ -16,5 +16,5 @@ package mm
 
 // afterLoad is invoked by stateify.
 func (a *AIOContext) afterLoad() {
-	a.done = make(chan struct{}, 1)
+	a.requestReady = make(chan struct{}, 1)
 }
diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go
index b401978db..38cbeba5a 100644
--- a/pkg/sentry/syscalls/linux/sys_aio.go
+++ b/pkg/sentry/syscalls/linux/sys_aio.go
@@ -114,14 +114,28 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
 	id := args[0].Uint64()
 
-	// Destroy the given context.
-	if !t.MemoryManager().DestroyAIOContext(t, id) {
+	ctx := t.MemoryManager().DestroyAIOContext(t, id)
+	if ctx == nil {
 		// Does not exist.
 		return 0, nil, syserror.EINVAL
 	}
-	// FIXME(fvoznika): Linux blocks until all AIO to the destroyed context is
-	// done.
-	return 0, nil, nil
+
+	// Drain completed requests and wait for pending requests until there are no
+	// more.
+	for {
+		ctx.Drain()
+
+		ch := ctx.WaitChannel()
+		if ch == nil {
+			// No more requests, we're done.
+			return 0, nil, nil
+		}
+		// The task cannot be interrupted during the wait. Equivalent to
+		// TASK_UNINTERRUPTIBLE in Linux.
+		t.UninterruptibleSleepStart(true /* deactivate */)
+		<-ch
+		t.UninterruptibleSleepFinish(true /* activate */)
+	}
 }
 
 // IoGetevents implements linux syscall io_getevents(2).
@@ -200,13 +214,13 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
 func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (interface{}, error) {
 	for {
 		if v, ok := ctx.PopRequest(); ok {
-			// Request was readly available. Just return it.
+			// Request was readily available. Just return it.
 			return v, nil
 		}
 
 		// Need to wait for request completion.
-		done, active := ctx.WaitChannel()
-		if !active {
+		done := ctx.WaitChannel()
+		if done == nil {
 			// Context has been destroyed.
 			return nil, syserror.EINVAL
 		}
@@ -248,6 +262,10 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) {
 }
 
 func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) {
+	if ctx.Dead() {
+		ctx.CancelPendingRequest()
+		return
+	}
 	ev := &ioEvent{
 		Data: cb.Data,
 		Obj:  uint64(cbAddr),
diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc
index a33daff17..806d5729e 100644
--- a/test/syscalls/linux/aio.cc
+++ b/test/syscalls/linux/aio.cc
@@ -89,6 +89,7 @@ class AIOTest : public FileTest {
     FileTest::TearDown();
     if (ctx_ != 0) {
       ASSERT_THAT(DestroyContext(), SyscallSucceeds());
+      ctx_ = 0;
     }
   }
 
@@ -188,14 +189,19 @@ TEST_F(AIOTest, BadWrite) {
 }
 
 TEST_F(AIOTest, ExitWithPendingIo) {
-  // Setup a context that is 5 entries deep.
-  ASSERT_THAT(SetupContext(5), SyscallSucceeds());
+  // Setup a context that is 100 entries deep.
+  ASSERT_THAT(SetupContext(100), SyscallSucceeds());
 
   struct iocb cb = CreateCallback();
   struct iocb* cbs[] = {&cb};
 
   // Submit a request but don't complete it to make it pending.
-  EXPECT_THAT(Submit(1, cbs), SyscallSucceeds());
+  for (int i = 0; i < 100; ++i) {
+    EXPECT_THAT(Submit(1, cbs), SyscallSucceeds());
+  }
+
+  ASSERT_THAT(DestroyContext(), SyscallSucceeds());
+  ctx_ = 0;
 }
 
 int Submitter(void* arg) {
-- 
cgit v1.2.3


From 35e6b6bf1aeb909a12fb80cc99d5695408a9eaa5 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Tue, 17 Mar 2020 06:59:54 +0000
Subject: Enable syscall fork_test on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I033692bcf4f8139df29e369a12b150d10fccbe32
---
 test/syscalls/linux/fork.cc | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
index ff8bdfeb0..853f6231a 100644
--- a/test/syscalls/linux/fork.cc
+++ b/test/syscalls/linux/fork.cc
@@ -431,7 +431,6 @@ TEST(CloneTest, NewUserNamespacePermitsAllOtherNamespaces) {
       << "status = " << status;
 }
 
-#ifdef __x86_64__
 // Clone with CLONE_SETTLS and a non-canonical TLS address is rejected.
 TEST(CloneTest, NonCanonicalTLS) {
   constexpr uintptr_t kNonCanonical = 1ull << 48;
@@ -440,11 +439,25 @@ TEST(CloneTest, NonCanonicalTLS) {
   // on this.
   char stack;
 
+  // The raw system call interface on x86-64 is:
+  // long clone(unsigned long flags, void *stack,
+  //            int *parent_tid, int *child_tid,
+  //            unsigned long tls);
+  //
+  // While on arm64, the order of the last two arguments is reversed:
+  // long clone(unsigned long flags, void *stack,
+  //            int *parent_tid, unsigned long tls,
+  //            int *child_tid);
+#if defined(__x86_64__)
   EXPECT_THAT(syscall(__NR_clone, SIGCHLD | CLONE_SETTLS, &stack, nullptr,
                       nullptr, kNonCanonical),
               SyscallFailsWithErrno(EPERM));
-}
+#elif defined(__aarch64__)
+  EXPECT_THAT(syscall(__NR_clone, SIGCHLD | CLONE_SETTLS, &stack, nullptr,
+                      kNonCanonical, nullptr),
+              SyscallFailsWithErrno(EPERM));
 #endif
+}
 
 }  // namespace
 }  // namespace testing
-- 
cgit v1.2.3


From 7aa5caae71c29b0be9047a7c156a9daaa435ebb8 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo.xu@arm.com>
Date: Wed, 11 Mar 2020 03:21:34 +0000
Subject: Enable syscall ptrace test on arm64.

Signed-off-by: Haibo Xu <haibo.xu@arm.com>
Change-Id: I5bb8fa7d580d173b1438d6465e1adb442216c8fa
---
 pkg/sentry/arch/arch.go           |  3 +++
 pkg/sentry/arch/syscalls_amd64.go |  7 +++++++
 pkg/sentry/arch/syscalls_arm64.go | 13 ++++++++++++-
 pkg/sentry/kernel/task_syscall.go | 14 ++++++++++++++
 test/syscalls/linux/ptrace.cc     | 31 ++++++++++++++++++++++++-------
 5 files changed, 60 insertions(+), 8 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go
index 1d11cc472..a903d031c 100644
--- a/pkg/sentry/arch/arch.go
+++ b/pkg/sentry/arch/arch.go
@@ -88,6 +88,9 @@ type Context interface {
 	// SyscallNo returns the syscall number.
 	SyscallNo() uintptr
 
+	// SyscallSaveOrig saves the original register value.
+	SyscallSaveOrig()
+
 	// SyscallArgs returns the syscall arguments in an array.
 	SyscallArgs() SyscallArguments
 
diff --git a/pkg/sentry/arch/syscalls_amd64.go b/pkg/sentry/arch/syscalls_amd64.go
index 8b4f23007..3859f41ee 100644
--- a/pkg/sentry/arch/syscalls_amd64.go
+++ b/pkg/sentry/arch/syscalls_amd64.go
@@ -18,6 +18,13 @@ package arch
 
 const restartSyscallNr = uintptr(219)
 
+// SyscallSaveOrig saves the value of the register which is clobbered in
+// syscall handler(doSyscall()).
+//
+// Noop on x86.
+func (c *context64) SyscallSaveOrig() {
+}
+
 // SyscallNo returns the syscall number according to the 64-bit convention.
 func (c *context64) SyscallNo() uintptr {
 	return uintptr(c.Regs.Orig_rax)
diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go
index dc13b6124..92d062513 100644
--- a/pkg/sentry/arch/syscalls_arm64.go
+++ b/pkg/sentry/arch/syscalls_arm64.go
@@ -18,6 +18,17 @@ package arch
 
 const restartSyscallNr = uintptr(128)
 
+// SyscallSaveOrig saves the value of the register R0 which is clobbered in
+// syscall handler(doSyscall()).
+//
+// In linux, at the entry of the syscall handler(el0_svc_common()), value of R0
+// is saved to the pt_regs.orig_x0 in kernel code. But currently, the orig_x0
+// was not accessible to the user space application, so we have to do the same
+// operation in the sentry code to save the R0 value into the App context.
+func (c *context64) SyscallSaveOrig() {
+	c.OrigR0 = c.Regs.Regs[0]
+}
+
 // SyscallNo returns the syscall number according to the 64-bit convention.
 func (c *context64) SyscallNo() uintptr {
 	return uintptr(c.Regs.Regs[8])
@@ -40,7 +51,7 @@ func (c *context64) SyscallNo() uintptr {
 // R30: the link register.
 func (c *context64) SyscallArgs() SyscallArguments {
 	return SyscallArguments{
-		SyscallArgument{Value: uintptr(c.Regs.Regs[0])},
+		SyscallArgument{Value: uintptr(c.OrigR0)},
 		SyscallArgument{Value: uintptr(c.Regs.Regs[1])},
 		SyscallArgument{Value: uintptr(c.Regs.Regs[2])},
 		SyscallArgument{Value: uintptr(c.Regs.Regs[3])},
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index d555d69a8..3d7a734ef 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -194,6 +194,19 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u
 //
 // The syscall path is very hot; avoid defer.
 func (t *Task) doSyscall() taskRunState {
+	// Save value of the register which is clobbered in the following
+	// t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64.
+	//
+	// On x86, register rax was shared by syscall number and return
+	// value, and at the entry of the syscall handler, the rax was
+	// saved to regs.orig_rax which was exposed to user space.
+	// But on arm64, syscall number was passed through X8, and the X0
+	// was shared by the first syscall argument and return value. The
+	// X0 was saved to regs.orig_x0 which was not exposed to user space.
+	// So we have to do the same operation here to save the X0 value
+	// into the task context.
+	t.Arch().SyscallSaveOrig()
+
 	sysno := t.Arch().SyscallNo()
 	args := t.Arch().SyscallArgs()
 
@@ -269,6 +282,7 @@ func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
 		return (*runSyscallExit)(nil)
 	}
 	args := t.Arch().SyscallArgs()
+
 	return t.doSyscallInvoke(sysno, args)
 }
 
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
index cb828ff88..926690eb8 100644
--- a/test/syscalls/linux/ptrace.cc
+++ b/test/syscalls/linux/ptrace.cc
@@ -400,9 +400,11 @@ TEST(PtraceTest, GetRegSet) {
   // Read exactly the full register set.
   EXPECT_EQ(iov.iov_len, sizeof(regs));
 
-#ifdef __x86_64__
+#if defined(__x86_64__)
   // Child called kill(2), with SIGSTOP as arg 2.
   EXPECT_EQ(regs.rsi, SIGSTOP);
+#elif defined(__aarch64__)
+  EXPECT_EQ(regs.regs[1], SIGSTOP);
 #endif
 
   // Suppress SIGSTOP and resume the child.
@@ -752,15 +754,23 @@ TEST(PtraceTest,
               SyscallSucceeds());
   EXPECT_TRUE(siginfo.si_code == SIGTRAP || siginfo.si_code == (SIGTRAP | 0x80))
       << "si_code = " << siginfo.si_code;
-#ifdef __x86_64__
+
   {
     struct user_regs_struct regs = {};
-    ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, &regs), SyscallSucceeds());
+    struct iovec iov;
+    iov.iov_base = &regs;
+    iov.iov_len = sizeof(regs);
+    EXPECT_THAT(ptrace(PTRACE_GETREGSET, child_pid, NT_PRSTATUS, &iov),
+                SyscallSucceeds());
+#if defined(__x86_64__)
     EXPECT_TRUE(regs.orig_rax == SYS_vfork || regs.orig_rax == SYS_clone)
         << "orig_rax = " << regs.orig_rax;
     EXPECT_EQ(grandchild_pid, regs.rax);
-  }
+#elif defined(__aarch64__)
+    EXPECT_TRUE(regs.regs[8] == SYS_clone) << "regs[8] = " << regs.regs[8];
+    EXPECT_EQ(grandchild_pid, regs.regs[0]);
 #endif  // defined(__x86_64__)
+  }
 
   // After this point, the child will be making wait4 syscalls that will be
   // interrupted by saving, so saving is not permitted. Note that this is
@@ -805,14 +815,21 @@ TEST(PtraceTest,
               SyscallSucceedsWithValue(child_pid));
   EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80))
       << " status " << status;
-#ifdef __x86_64__
   {
     struct user_regs_struct regs = {};
-    ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, &regs), SyscallSucceeds());
+    struct iovec iov;
+    iov.iov_base = &regs;
+    iov.iov_len = sizeof(regs);
+    EXPECT_THAT(ptrace(PTRACE_GETREGSET, child_pid, NT_PRSTATUS, &iov),
+                SyscallSucceeds());
+#if defined(__x86_64__)
     EXPECT_EQ(SYS_wait4, regs.orig_rax);
     EXPECT_EQ(grandchild_pid, regs.rax);
-  }
+#elif defined(__aarch64__)
+    EXPECT_EQ(SYS_wait4, regs.regs[8]);
+    EXPECT_EQ(grandchild_pid, regs.regs[0]);
 #endif  // defined(__x86_64__)
+  }
 
   // Detach from the child and wait for it to exit.
   ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds());
-- 
cgit v1.2.3


From 935007937cee1e2867cc4fc5c00b7f370864e241 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 10 Apr 2020 07:13:16 -0700
Subject: test: remove 1s delay after non-blocking socket pair accept

It was added in cl/201419897 to deflake
socket_ip_tcp_loopback_non_blocking_test_gvisor.
It seems we don't need this hack, because the original issue isn't
reproducible without this hack.

PiperOrigin-RevId: 305871748
---
 test/syscalls/linux/socket_test_util.cc | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
index 5d3a39868..53b678e94 100644
--- a/test/syscalls/linux/socket_test_util.cc
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -364,11 +364,6 @@ CreateTCPConnectAcceptSocketPair(int bound, int connected, int type,
   }
   MaybeSave();  // Successful accept.
 
-  // FIXME(b/110484944)
-  if (connect_result == -1) {
-    absl::SleepFor(absl::Seconds(1));
-  }
-
   T extra_addr = {};
   LocalhostAddr(&extra_addr, dual_stack);
   return absl::make_unique<AddrFDSocketPair>(connected, accepted, bind_addr,
-- 
cgit v1.2.3


From 09ddb5a4262c39744643b612109dd12dcce176a8 Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Fri, 10 Apr 2020 19:01:39 -0700
Subject: Port extended attributes to VFS2.

As in VFS1, we only support the user.* namespace. Plumbing is added to tmpfs
and goferfs.
Note that because of the slightly different order of checks between VFS2 and
Linux, one of the xattr tests needs to be relaxed slightly.

Fixes #2363.

PiperOrigin-RevId: 305985121
---
 pkg/sentry/fsimpl/ext/filesystem.go          |   4 +-
 pkg/sentry/fsimpl/gofer/filesystem.go        |  12 ++--
 pkg/sentry/fsimpl/gofer/gofer.go             |  58 +++++++++++----
 pkg/sentry/fsimpl/gofer/p9file.go            |  14 ++++
 pkg/sentry/fsimpl/kernfs/filesystem.go       |   4 +-
 pkg/sentry/fsimpl/tmpfs/BUILD                |   1 +
 pkg/sentry/fsimpl/tmpfs/filesystem.go        |  24 +++----
 pkg/sentry/fsimpl/tmpfs/tmpfs.go             |  77 ++++++++++++++++++++
 pkg/sentry/syscalls/linux/vfs2/xattr.go      |  13 ++--
 pkg/sentry/vfs/anonfs.go                     |   4 +-
 pkg/sentry/vfs/file_description.go           |  32 ++++++---
 pkg/sentry/vfs/file_description_impl_util.go |   4 +-
 pkg/sentry/vfs/filesystem.go                 |  24 ++++++-
 pkg/sentry/vfs/memxattr/BUILD                |  15 ++++
 pkg/sentry/vfs/memxattr/xattr.go             | 102 +++++++++++++++++++++++++++
 pkg/sentry/vfs/options.go                    |  14 ++++
 pkg/sentry/vfs/vfs.go                        |   8 +--
 test/syscalls/linux/xattr.cc                 |   8 +--
 18 files changed, 350 insertions(+), 68 deletions(-)
 create mode 100644 pkg/sentry/vfs/memxattr/BUILD
 create mode 100644 pkg/sentry/vfs/memxattr/xattr.go

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 48eaccdbc..afea58f65 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -476,7 +476,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 }
 
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	_, _, err := fs.walk(rp, false)
 	if err != nil {
 		return nil, err
@@ -485,7 +485,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
 }
 
 // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
 	_, _, err := fs.walk(rp, false)
 	if err != nil {
 		return "", err
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 137260898..cd744bf5e 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -1080,7 +1080,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 }
 
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
@@ -1088,11 +1088,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
 	if err != nil {
 		return nil, err
 	}
-	return d.listxattr(ctx)
+	return d.listxattr(ctx, rp.Credentials(), size)
 }
 
 // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(&ds)
@@ -1100,7 +1100,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, nam
 	if err != nil {
 		return "", err
 	}
-	return d.getxattr(ctx, name)
+	return d.getxattr(ctx, rp.Credentials(), &opts)
 }
 
 // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
@@ -1112,7 +1112,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	if err != nil {
 		return err
 	}
-	return d.setxattr(ctx, &opts)
+	return d.setxattr(ctx, rp.Credentials(), &opts)
 }
 
 // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
@@ -1124,7 +1124,7 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 	if err != nil {
 		return err
 	}
-	return d.removexattr(ctx, name)
+	return d.removexattr(ctx, rp.Credentials(), name)
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index bdf11fa65..2485cdb53 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -34,6 +34,7 @@ package gofer
 import (
 	"fmt"
 	"strconv"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"syscall"
@@ -1024,21 +1025,50 @@ func (d *dentry) setDeleted() {
 	atomic.StoreUint32(&d.deleted, 1)
 }
 
-func (d *dentry) listxattr(ctx context.Context) ([]string, error) {
-	return nil, syserror.ENOTSUP
+// We only support xattrs prefixed with "user." (see b/148380782). Currently,
+// there is no need to expose any other xattrs through a gofer.
+func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+	xattrMap, err := d.file.listXattr(ctx, size)
+	if err != nil {
+		return nil, err
+	}
+	xattrs := make([]string, 0, len(xattrMap))
+	for x := range xattrMap {
+		if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
+			xattrs = append(xattrs, x)
+		}
+	}
+	return xattrs, nil
 }
 
-func (d *dentry) getxattr(ctx context.Context, name string) (string, error) {
-	// TODO(jamieliu): add vfs.GetxattrOptions.Size
-	return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX)
+func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+	if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
+		return "", err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return "", syserror.EOPNOTSUPP
+	}
+	return d.file.getXattr(ctx, opts.Name, opts.Size)
 }
 
-func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error {
+func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
 	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
 }
 
-func (d *dentry) removexattr(ctx context.Context, name string) error {
-	return syserror.ENOTSUP
+func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	return d.file.removeXattr(ctx, name)
 }
 
 // Preconditions: d.isRegularFile() || d.isDirectory().
@@ -1189,21 +1219,21 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 }
 
 // Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) {
-	return fd.dentry().listxattr(ctx)
+func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size)
 }
 
 // Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) {
-	return fd.dentry().getxattr(ctx, name)
+func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
+	return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
 }
 
 // Setxattr implements vfs.FileDescriptionImpl.Setxattr.
 func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
-	return fd.dentry().setxattr(ctx, &opts)
+	return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
 }
 
 // Removexattr implements vfs.FileDescriptionImpl.Removexattr.
 func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
-	return fd.dentry().removexattr(ctx, name)
+	return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name)
 }
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
index 755ac2985..87f0b877f 100644
--- a/pkg/sentry/fsimpl/gofer/p9file.go
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -85,6 +85,13 @@ func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAt
 	return err
 }
 
+func (f p9file) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
+	ctx.UninterruptibleSleepStart(false)
+	xattrs, err := f.file.ListXattr(size)
+	ctx.UninterruptibleSleepFinish(false)
+	return xattrs, err
+}
+
 func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) {
 	ctx.UninterruptibleSleepStart(false)
 	val, err := f.file.GetXattr(name, size)
@@ -99,6 +106,13 @@ func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32)
 	return err
 }
 
+func (f p9file) removeXattr(ctx context.Context, name string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.RemoveXattr(name)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
 func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
 	ctx.UninterruptibleSleepStart(false)
 	err := f.file.Allocate(mode, offset, length)
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 4433071aa..baf81b4db 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -763,7 +763,7 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 }
 
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	fs.mu.RLock()
 	_, _, err := fs.walkExistingLocked(ctx, rp)
 	fs.mu.RUnlock()
@@ -776,7 +776,7 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
 }
 
 // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
 	fs.mu.RLock()
 	_, _, err := fs.walkExistingLocked(ctx, rp)
 	fs.mu.RUnlock()
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index f2ac23c88..4e6cd3491 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -51,6 +51,7 @@ go_library(
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
         "//pkg/sentry/vfs/lock",
+        "//pkg/sentry/vfs/memxattr",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 5339d7072..f4d50d64f 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -696,51 +696,47 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 }
 
 // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return nil, err
 	}
-	// TODO(b/127675828): support extended attributes
-	return nil, syserror.ENOTSUP
+	return d.inode.listxattr(size)
 }
 
 // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return "", err
 	}
-	// TODO(b/127675828): support extended attributes
-	return "", syserror.ENOTSUP
+	return d.inode.getxattr(rp.Credentials(), &opts)
 }
 
 // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
 func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return err
 	}
-	// TODO(b/127675828): support extended attributes
-	return syserror.ENOTSUP
+	return d.inode.setxattr(rp.Credentials(), &opts)
 }
 
 // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
 func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return err
 	}
-	// TODO(b/127675828): support extended attributes
-	return syserror.ENOTSUP
+	return d.inode.removexattr(rp.Credentials(), name)
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 654e788e3..9fa8637d5 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -27,6 +27,7 @@ package tmpfs
 import (
 	"fmt"
 	"math"
+	"strings"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -37,6 +38,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -186,6 +188,11 @@ type inode struct {
 	// filesystem.RmdirAt() drops the reference.
 	refs int64
 
+	// xattrs implements extended attributes.
+	//
+	// TODO(b/148380782): Support xattrs other than user.*
+	xattrs memxattr.SimpleExtendedAttributes
+
 	// Inode metadata. Writing multiple fields atomically requires holding
 	// mu, othewise atomic operations can be used.
 	mu    sync.Mutex
@@ -535,6 +542,56 @@ func (i *inode) touchCMtimeLocked() {
 	atomic.StoreInt64(&i.ctime, now)
 }
 
+func (i *inode) listxattr(size uint64) ([]string, error) {
+	return i.xattrs.Listxattr(size)
+}
+
+func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+	if err := i.checkPermissions(creds, vfs.MayRead); err != nil {
+		return "", err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return "", syserror.EOPNOTSUPP
+	}
+	if !i.userXattrSupported() {
+		return "", syserror.ENODATA
+	}
+	return i.xattrs.Getxattr(opts)
+}
+
+func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	if !i.userXattrSupported() {
+		return syserror.EPERM
+	}
+	return i.xattrs.Setxattr(opts)
+}
+
+func (i *inode) removexattr(creds *auth.Credentials, name string) error {
+	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	if !i.userXattrSupported() {
+		return syserror.EPERM
+	}
+	return i.xattrs.Removexattr(name)
+}
+
+// Extended attributes in the user.* namespace are only supported for regular
+// files and directories.
+func (i *inode) userXattrSupported() bool {
+	filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode)
+	return filetype == linux.S_IFREG || filetype == linux.S_IFDIR
+}
+
 // fileDescription is embedded by tmpfs implementations of
 // vfs.FileDescriptionImpl.
 type fileDescription struct {
@@ -562,3 +619,23 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 	creds := auth.CredentialsFromContext(ctx)
 	return fd.inode().setStat(ctx, creds, &opts.Stat)
 }
+
+// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
+func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.inode().listxattr(size)
+}
+
+// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
+func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
+	return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts)
+}
+
+// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
+func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+	return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts)
+}
+
+// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
+func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+	return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go
index 89e9ff4d7..af455d5c1 100644
--- a/pkg/sentry/syscalls/linux/vfs2/xattr.go
+++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go
@@ -51,7 +51,7 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml
 	}
 	defer tpop.Release()
 
-	names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop)
+	names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size))
 	if err != nil {
 		return 0, nil, err
 	}
@@ -74,7 +74,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
 	}
 	defer file.DecRef()
 
-	names, err := file.Listxattr(t)
+	names, err := file.Listxattr(t, uint64(size))
 	if err != nil {
 		return 0, nil, err
 	}
@@ -116,7 +116,10 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli
 		return 0, nil, err
 	}
 
-	value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, name)
+	value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetxattrOptions{
+		Name: name,
+		Size: uint64(size),
+	})
 	if err != nil {
 		return 0, nil, err
 	}
@@ -145,7 +148,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, err
 	}
 
-	value, err := file.Getxattr(t, name)
+	value, err := file.Getxattr(t, &vfs.GetxattrOptions{Name: name, Size: uint64(size)})
 	if err != nil {
 		return 0, nil, err
 	}
@@ -230,7 +233,7 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, err
 	}
 
-	return 0, nil, file.Setxattr(t, vfs.SetxattrOptions{
+	return 0, nil, file.Setxattr(t, &vfs.SetxattrOptions{
 		Name:  name,
 		Value: value,
 		Flags: uint32(flags),
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index d1f6dfb45..a64d86122 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -245,7 +245,7 @@ func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath
 }
 
 // ListxattrAt implements FilesystemImpl.ListxattrAt.
-func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
+func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
 	if !rp.Done() {
 		return nil, syserror.ENOTDIR
 	}
@@ -253,7 +253,7 @@ func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([
 }
 
 // GetxattrAt implements FilesystemImpl.GetxattrAt.
-func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
+func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) {
 	if !rp.Done() {
 		return "", syserror.ENOTDIR
 	}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 20c545fca..4fb9aea87 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -401,11 +401,11 @@ type FileDescriptionImpl interface {
 	Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
 
 	// Listxattr returns all extended attribute names for the file.
-	Listxattr(ctx context.Context) ([]string, error)
+	Listxattr(ctx context.Context, size uint64) ([]string, error)
 
 	// Getxattr returns the value associated with the given extended attribute
 	// for the file.
-	Getxattr(ctx context.Context, name string) (string, error)
+	Getxattr(ctx context.Context, opts GetxattrOptions) (string, error)
 
 	// Setxattr changes the value associated with the given extended attribute
 	// for the file.
@@ -605,18 +605,23 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
 
 // Listxattr returns all extended attribute names for the file represented by
 // fd.
-func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
+//
+// If the size of the list (including a NUL terminating byte after every entry)
+// would exceed size, ERANGE may be returned (note that implementations
+// are free to ignore size entirely and return without error). In all cases,
+// if size is 0, the list should be returned without error, regardless of size.
+func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp)
+		names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
 		vfsObj.putResolvingPath(rp)
 		return names, err
 	}
-	names, err := fd.impl.Listxattr(ctx)
+	names, err := fd.impl.Listxattr(ctx, size)
 	if err == syserror.ENOTSUP {
 		// Linux doesn't actually return ENOTSUP in this case; instead,
 		// fs/xattr.c:vfs_listxattr() falls back to allowing the security
@@ -629,34 +634,39 @@ func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
 
 // Getxattr returns the value associated with the given extended attribute for
 // the file represented by fd.
-func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+//
+// If the size of the return value exceeds opts.Size, ERANGE may be returned
+// (note that implementations are free to ignore opts.Size entirely and return
+// without error). In all cases, if opts.Size is 0, the value should be
+// returned without error, regardless of size.
+func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name)
+		val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
 		vfsObj.putResolvingPath(rp)
 		return val, err
 	}
-	return fd.impl.Getxattr(ctx, name)
+	return fd.impl.Getxattr(ctx, *opts)
 }
 
 // Setxattr changes the value associated with the given extended attribute for
 // the file represented by fd.
-func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error {
 	if fd.opts.UseDentryMetadata {
 		vfsObj := fd.vd.mount.vfs
 		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
 			Root:  fd.vd,
 			Start: fd.vd,
 		})
-		err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts)
+		err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
 		vfsObj.putResolvingPath(rp)
 		return err
 	}
-	return fd.impl.Setxattr(ctx, opts)
+	return fd.impl.Setxattr(ctx, *opts)
 }
 
 // Removexattr removes the given extended attribute from the file represented
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index d45e602ce..f4c111926 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -130,14 +130,14 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
 
 // Listxattr implements FileDescriptionImpl.Listxattr analogously to
 // inode_operations::listxattr == NULL in Linux.
-func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) {
+func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) {
 	// This isn't exactly accurate; see FileDescription.Listxattr.
 	return nil, syserror.ENOTSUP
 }
 
 // Getxattr implements FileDescriptionImpl.Getxattr analogously to
 // inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) {
+func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) {
 	return "", syserror.ENOTSUP
 }
 
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index bef1bd312..a537a29d1 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -442,7 +442,13 @@ type FilesystemImpl interface {
 	// - If extended attributes are not supported by the filesystem,
 	// ListxattrAt returns nil. (See FileDescription.Listxattr for an
 	// explanation.)
-	ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error)
+	//
+	// - If the size of the list (including a NUL terminating byte after every
+	// entry) would exceed size, ERANGE may be returned (note that
+	// implementations are free to ignore size entirely and return without
+	// error). In all cases, if size is 0, the list should be returned without
+	// error, regardless of size.
+	ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
 
 	// GetxattrAt returns the value associated with the given extended
 	// attribute for the file at rp.
@@ -451,7 +457,15 @@ type FilesystemImpl interface {
 	//
 	// - If extended attributes are not supported by the filesystem, GetxattrAt
 	// returns ENOTSUP.
-	GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error)
+	//
+	// - If an extended attribute named opts.Name does not exist, ENODATA is
+	// returned.
+	//
+	// - If the size of the return value exceeds opts.Size, ERANGE may be
+	// returned (note that implementations are free to ignore opts.Size entirely
+	// and return without error). In all cases, if opts.Size is 0, the value
+	// should be returned without error, regardless of size.
+	GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error)
 
 	// SetxattrAt changes the value associated with the given extended
 	// attribute for the file at rp.
@@ -460,6 +474,10 @@ type FilesystemImpl interface {
 	//
 	// - If extended attributes are not supported by the filesystem, SetxattrAt
 	// returns ENOTSUP.
+	//
+	// - If XATTR_CREATE is set in opts.Flags and opts.Name already exists,
+	// EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist,
+	// ENODATA is returned.
 	SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
 
 	// RemovexattrAt removes the given extended attribute from the file at rp.
@@ -468,6 +486,8 @@ type FilesystemImpl interface {
 	//
 	// - If extended attributes are not supported by the filesystem,
 	// RemovexattrAt returns ENOTSUP.
+	//
+	// - If name does not exist, ENODATA is returned.
 	RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
 
 	// BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
diff --git a/pkg/sentry/vfs/memxattr/BUILD b/pkg/sentry/vfs/memxattr/BUILD
new file mode 100644
index 000000000..d8c4d27b9
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "memxattr",
+    srcs = ["xattr.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go
new file mode 100644
index 000000000..cc1e7d764
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/xattr.go
@@ -0,0 +1,102 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memxattr provides a default, in-memory extended attribute
+// implementation.
+package memxattr
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// SimpleExtendedAttributes implements extended attributes using a map of
+// names to values.
+//
+// +stateify savable
+type SimpleExtendedAttributes struct {
+	// mu protects the below fields.
+	mu     sync.RWMutex `state:"nosave"`
+	xattrs map[string]string
+}
+
+// Getxattr returns the value stored under opts.Name.
+func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) {
+	x.mu.RLock()
+	value, ok := x.xattrs[opts.Name]
+	x.mu.RUnlock()
+	if !ok {
+		return "", syserror.ENODATA
+	}
+	// Check that the size of the buffer provided in getxattr(2) is large enough
+	// to contain the value.
+	if opts.Size != 0 && uint64(len(value)) > opts.Size {
+		return "", syserror.ERANGE
+	}
+	return value, nil
+}
+
+// Setxattr stores opts.Value under opts.Name.
+func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
+	x.mu.Lock()
+	defer x.mu.Unlock()
+	if x.xattrs == nil {
+		if opts.Flags&linux.XATTR_REPLACE != 0 {
+			return syserror.ENODATA
+		}
+		x.xattrs = make(map[string]string)
+	}
+
+	_, ok := x.xattrs[opts.Name]
+	if ok && opts.Flags&linux.XATTR_CREATE != 0 {
+		return syserror.EEXIST
+	}
+	if !ok && opts.Flags&linux.XATTR_REPLACE != 0 {
+		return syserror.ENODATA
+	}
+
+	x.xattrs[opts.Name] = opts.Value
+	return nil
+}
+
+// Listxattr returns all names in xattrs.
+func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
+	// Keep track of the size of the buffer needed in listxattr(2) for the list.
+	listSize := 0
+	x.mu.RLock()
+	names := make([]string, 0, len(x.xattrs))
+	for n := range x.xattrs {
+		names = append(names, n)
+		// Add one byte per null terminator.
+		listSize += len(n) + 1
+	}
+	x.mu.RUnlock()
+	if size != 0 && uint64(listSize) > size {
+		return nil, syserror.ERANGE
+	}
+	return names, nil
+}
+
+// Removexattr removes the xattr at 'name'.
+func (x *SimpleExtendedAttributes) Removexattr(name string) error {
+	x.mu.Lock()
+	defer x.mu.Unlock()
+	if _, ok := x.xattrs[name]; !ok {
+		return syserror.ENODATA
+	}
+	delete(x.xattrs, name)
+	return nil
+}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 2f04bf882..534528ce6 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -132,6 +132,20 @@ type SetStatOptions struct {
 	Stat linux.Statx
 }
 
+// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(),
+// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and
+// FileDescriptionImpl.Getxattr().
+type GetxattrOptions struct {
+	// Name is the name of the extended attribute to retrieve.
+	Name string
+
+	// Size is the maximum value size that the caller will tolerate. If the value
+	// is larger than size, getxattr methods may return ERANGE, but they are also
+	// free to ignore the hint entirely (i.e. the value returned may be larger
+	// than size). All size checking is done independently at the syscall layer.
+	Size uint64
+}
+
 // SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
 // FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
 // FileDescriptionImpl.Setxattr().
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 720b90d8f..f592913d5 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -680,10 +680,10 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
 
 // ListxattrAt returns all extended attribute names for the file at the given
 // path.
-func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) {
+func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp)
+		names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return names, nil
@@ -705,10 +705,10 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
 
 // GetxattrAt returns the value associated with the given extended attribute
 // for the file at the given path.
-func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) {
+func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) {
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name)
+		val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return val, nil
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index 8b00ef44c..3231732ec 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -41,12 +41,12 @@ class XattrTest : public FileTest {};
 
 TEST_F(XattrTest, XattrNonexistentFile) {
   const char* path = "/does/not/exist";
-  EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
-              SyscallFailsWithErrno(ENOENT));
-  EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
+  const char* name = "user.test";
+  EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
               SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENOENT));
   EXPECT_THAT(listxattr(path, nullptr, 0), SyscallFailsWithErrno(ENOENT));
-  EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(ENOENT));
+  EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(ENOENT));
 }
 
 TEST_F(XattrTest, XattrNullName) {
-- 
cgit v1.2.3


From d303684d7ab9b8a3961398fcf12560956ee9e2e3 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Mon, 13 Apr 2020 16:59:45 -0700
Subject: Remove unnecessary threads

The work being done in these threads is not asynchronous with respect to
the test; that is, it is equivalent to issue non-blocking `connect`
calls serially, since the work is done asynchronously with respect to
the caller. Furthermore, this test was added to test closing a listener
with completed but not delivered connections, which never required
threading in the first place.

PiperOrigin-RevId: 306339486
---
 test/syscalls/linux/socket_inet_loopback.cc | 40 ++++++++---------------------
 1 file changed, 11 insertions(+), 29 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 030c3b835..71bd7c14d 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -325,11 +325,9 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   TestAddress const& listener = param.listener;
   TestAddress const& connector = param.connector;
 
-  constexpr int kAcceptCount = 32;
-  constexpr int kBacklog = kAcceptCount * 2;
-  constexpr int kFDs = 128;
-  constexpr int kThreadCount = 4;
-  constexpr int kFDsPerThread = kFDs / kThreadCount;
+  constexpr int kAcceptCount = 2;
+  constexpr int kBacklog = kAcceptCount + 2;
+  constexpr int kFDs = kBacklog * 3;
 
   // Create the listening socket.
   FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
@@ -348,39 +346,23 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   uint16_t const port =
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
-  DisableSave ds;  // Too many system calls.
   sockaddr_storage conn_addr = connector.addr;
   ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
-  FileDescriptor clients[kFDs];
-  std::unique_ptr<ScopedThread> threads[kThreadCount];
+  std::vector<FileDescriptor> clients;
   for (int i = 0; i < kFDs; i++) {
-    clients[i] = ASSERT_NO_ERRNO_AND_VALUE(
+    auto client = ASSERT_NO_ERRNO_AND_VALUE(
         Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
-  }
-  for (int i = 0; i < kThreadCount; i++) {
-    threads[i] = absl::make_unique<ScopedThread>([&connector, &conn_addr,
-                                                  &clients, i]() {
-      for (int j = 0; j < kFDsPerThread; j++) {
-        int k = i * kFDsPerThread + j;
-        int ret =
-            connect(clients[k].get(), reinterpret_cast<sockaddr*>(&conn_addr),
-                    connector.addr_len);
-        if (ret != 0) {
-          EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
-        }
-      }
-    });
-  }
-  for (int i = 0; i < kThreadCount; i++) {
-    threads[i]->Join();
+    int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                      connector.addr_len);
+    if (ret != 0) {
+      EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
+    }
+    clients.push_back(std::move(client));
   }
   for (int i = 0; i < kAcceptCount; i++) {
     auto accepted =
         ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
   }
-  // TODO(b/138400178): Fix cooperative S/R failure when ds.reset() is invoked
-  // before function end.
-  // ds.reset();
 }
 
 TEST_P(SocketInetLoopbackTest, TCPbacklog) {
-- 
cgit v1.2.3


From 71e6ac3e1f551cf52166bf501de114f06502b994 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Mon, 13 Apr 2020 17:58:52 -0700
Subject: Don't allow read/write when offset+size overflows.

PiperOrigin-RevId: 306348346
---
 pkg/sentry/syscalls/linux/sys_read.go        |  8 ++++----
 pkg/sentry/syscalls/linux/sys_splice.go      |  4 +++-
 pkg/sentry/syscalls/linux/sys_write.go       |  4 ++--
 pkg/sentry/syscalls/linux/vfs2/read_write.go |  8 ++++----
 test/syscalls/linux/memfd.cc                 |  1 +
 test/syscalls/linux/pread64.cc               | 16 ++++++++++++++++
 test/syscalls/linux/pwrite64.cc              | 12 ++++++++++++
 test/syscalls/linux/sendfile.cc              | 23 +++++++++++++++++++++++
 test/syscalls/linux/splice.cc                |  1 +
 9 files changed, 66 insertions(+), 11 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index 78a2cb750..071b4bacc 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -96,8 +96,8 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, syserror.EINVAL
 	}
 
-	// Check that the offset is legitimate.
-	if offset < 0 {
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
@@ -120,8 +120,8 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	}
 	defer file.DecRef()
 
-	// Check that the offset is legitimate.
-	if offset < 0 {
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index fbc6cf15f..df0d0f461 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -16,6 +16,7 @@ package linux
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -25,7 +26,8 @@ import (
 
 // doSplice implements a blocking splice operation.
 func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) {
-	if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 {
+	log.Infof("NLAC: doSplice opts: %+v", opts)
+	if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 || (opts.SrcStart+opts.Length < 0) {
 		return 0, syserror.EINVAL
 	}
 
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index 506ee54ce..6ec0de96e 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -87,8 +87,8 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 	defer file.DecRef()
 
-	// Check that the offset is legitimate.
-	if offset < 0 {
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
index 35f6308d6..898b190fd 100644
--- a/pkg/sentry/syscalls/linux/vfs2/read_write.go
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -130,8 +130,8 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 	}
 	defer file.DecRef()
 
-	// Check that the offset is legitimate.
-	if offset < 0 {
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
@@ -362,8 +362,8 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 	defer file.DecRef()
 
-	// Check that the offset is legitimate.
-	if offset < 0 {
+	// Check that the offset is legitimate and does not overflow.
+	if offset < 0 || offset+int64(size) < 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc
index e57b49a4a..f8b7f7938 100644
--- a/test/syscalls/linux/memfd.cc
+++ b/test/syscalls/linux/memfd.cc
@@ -16,6 +16,7 @@
 #include <fcntl.h>
 #include <linux/magic.h>
 #include <linux/memfd.h>
+#include <linux/unistd.h>
 #include <string.h>
 #include <sys/mman.h>
 #include <sys/statfs.h>
diff --git a/test/syscalls/linux/pread64.cc b/test/syscalls/linux/pread64.cc
index 2cecf2e5f..bcdbbb044 100644
--- a/test/syscalls/linux/pread64.cc
+++ b/test/syscalls/linux/pread64.cc
@@ -14,6 +14,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <linux/unistd.h>
 #include <sys/mman.h>
 #include <sys/socket.h>
 #include <sys/types.h>
@@ -118,6 +119,21 @@ TEST_F(Pread64Test, EndOfFile) {
   EXPECT_THAT(pread64(fd.get(), buf, 1024, 0), SyscallSucceedsWithValue(0));
 }
 
+int memfd_create(const std::string& name, unsigned int flags) {
+  return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+TEST_F(Pread64Test, Overflow) {
+  // Reads whose end offset overflows INT64_MAX must fail with EINVAL.
+  int f;
+  ASSERT_THAT(f = memfd_create("negative", 0), SyscallSucceeds());
+  const FileDescriptor fd(f);
+  EXPECT_THAT(ftruncate(fd.get(), 0x7fffffffffffffffull), SyscallSucceeds());
+  char buf[10];
+  EXPECT_THAT(pread64(fd.get(), buf, sizeof(buf), 0x7fffffffffffffffull),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(Pread64TestNoTempFile, CantReadSocketPair_NoRandomSave) {
   int sock_fds[2];
   EXPECT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds), SyscallSucceeds());
diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc
index c2f72e010..e69794910 100644
--- a/test/syscalls/linux/pwrite64.cc
+++ b/test/syscalls/linux/pwrite64.cc
@@ -14,6 +14,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <linux/unistd.h>
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <unistd.h>
@@ -65,6 +66,17 @@ TEST_F(Pwrite64, InvalidArgs) {
   EXPECT_THAT(close(fd), SyscallSucceeds());
 }
 
+TEST_F(Pwrite64, Overflow) {
+  // A write whose end offset would exceed INT64_MAX must be rejected.
+  constexpr int64_t kBufSize = 1024;
+  std::vector<char> buf(kBufSize, 'a');
+  int fd;
+  ASSERT_THAT(fd = open(name_.c_str(), O_APPEND | O_RDWR), SyscallSucceeds());
+  EXPECT_THAT(PwriteFd(fd, buf.data(), buf.size(), 0x7fffffffffffffffull),
+              SyscallFailsWithErrno(EINVAL));
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc
index ebaafe47e..64123e904 100644
--- a/test/syscalls/linux/sendfile.cc
+++ b/test/syscalls/linux/sendfile.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <fcntl.h>
+#include <linux/unistd.h>
 #include <sys/eventfd.h>
 #include <sys/sendfile.h>
 #include <unistd.h>
@@ -70,6 +71,28 @@ TEST(SendFileTest, InvalidOffset) {
               SyscallFailsWithErrno(EINVAL));
 }
 
+int memfd_create(const std::string& name, unsigned int flags) {
+  return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+TEST(SendFileTest, Overflow) {
+  // Create input file.
+  const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor inf =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+  // Open the output file; the rest of the test is meaningless without it.
+  int fd;
+  ASSERT_THAT(fd = memfd_create("overflow", 0), SyscallSucceeds());
+  const FileDescriptor outf(fd);
+
+  // in_offset + kSize overflows INT64_MAX (the offset applies to the input).
+  loff_t in_offset = 0x7ffffffffffffffeull;
+  constexpr int kSize = 3;
+  EXPECT_THAT(sendfile(outf.get(), inf.get(), &in_offset, kSize),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(SendFileTest, SendTrivially) {
   // Create temp files.
   constexpr char kData[] = "To be, or not to be, that is the question:";
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index faa1247f6..f103e2e56 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <fcntl.h>
+#include <linux/unistd.h>
 #include <sys/eventfd.h>
 #include <sys/resource.h>
 #include <sys/sendfile.h>
-- 
cgit v1.2.3


From 9c918340e4e6126cca1dfedbf28fec8c8f836e1a Mon Sep 17 00:00:00 2001
From: Mithun Iyer <iyerm@google.com>
Date: Wed, 15 Apr 2020 01:10:38 -0700
Subject: Reset pending connections on listener close

Attempt to redeliver TCP segments that are enqueued into a closing
TCP endpoint. This was being done for Established endpoints but not
for those that are listening or performing connection handshake.

Fixes #2417

PiperOrigin-RevId: 306598155
---
 pkg/tcpip/transport/tcp/accept.go           |  7 +++-
 pkg/tcpip/transport/tcp/connect.go          | 30 ++++++++------
 pkg/tcpip/transport/tcp/endpoint.go         | 30 +++++++-------
 pkg/tcpip/transport/tcp/tcp_test.go         | 37 +++++++++++++++++
 test/packetimpact/tests/BUILD               |  2 -
 test/syscalls/linux/socket_inet_loopback.cc | 62 +++++++++++++++++++++++++++++
 6 files changed, 138 insertions(+), 30 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 7a9dea4ac..e07b436c4 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -330,6 +330,9 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 		if l.listenEP != nil {
 			l.removePendingEndpoint(ep)
 		}
+
+		ep.drainClosingSegmentQueue()
+
 		return nil, err
 	}
 	ep.isConnectNotified = true
@@ -378,7 +381,7 @@ func (e *endpoint) deliverAccepted(n *endpoint) {
 	for {
 		if e.acceptedChan == nil {
 			e.acceptMu.Unlock()
-			n.Close()
+			n.notifyProtocolGoroutine(notifyReset)
 			return
 		}
 		select {
@@ -656,6 +659,8 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 		}
 		e.mu.Unlock()
 
+		e.drainClosingSegmentQueue()
+
 		// Notify waiters that the endpoint is shutdown.
 		e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut | waiter.EventHUp | waiter.EventErr)
 	}()
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 2ca3fb809..994ac52a3 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -1062,6 +1062,20 @@ func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 	}
 }
 
+// drainClosingSegmentQueue dequeues every segment still queued on e and tries
+// to re-match it to a different endpoint. It is used once e has transitioned
+// to StateClose and has been unregistered from the transport demuxer.
+func (e *endpoint) drainClosingSegmentQueue() {
+	for {
+		s := e.segmentQueue.dequeue()
+		if s == nil {
+			break
+		}
+
+		e.tryDeliverSegmentFromClosedEndpoint(s)
+	}
+}
+
 func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 	if e.rcv.acceptable(s.sequenceNumber, 0) {
 		// RFC 793, page 37 states that "in all states
@@ -1315,6 +1329,9 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 		}
 
 		e.mu.Unlock()
+
+		e.drainClosingSegmentQueue()
+
 		// When the protocol loop exits we should wake up our waiters.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 	}
@@ -1565,19 +1582,6 @@ loop:
 	// Lock released below.
 	epilogue()
 
-	// epilogue removes the endpoint from the transport-demuxer and
-	// unlocks e.mu. Now that no new segments can get enqueued to this
-	// endpoint, try to re-match the segment to a different endpoint
-	// as the current endpoint is closed.
-	for {
-		s := e.segmentQueue.dequeue()
-		if s == nil {
-			break
-		}
-
-		e.tryDeliverSegmentFromClosedEndpoint(s)
-	}
-
 	// A new SYN was received during TIME_WAIT and we need to abort
 	// the timewait and redirect the segment to the listener queue
 	if reuseTW != nil {
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index a8d443f73..7ed78d57f 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -980,25 +980,22 @@ func (e *endpoint) closeNoShutdownLocked() {
 
 	// Mark endpoint as closed.
 	e.closed = true
+
+	switch e.EndpointState() {
+	case StateClose, StateError:
+		return
+	}
+
 	// Either perform the local cleanup or kick the worker to make sure it
 	// knows it needs to cleanup.
-	switch e.EndpointState() {
-	// Sockets in StateSynRecv state(passive connections) are closed when
-	// the handshake fails or if the listening socket is closed while
-	// handshake was in progress. In such cases the handshake goroutine
-	// is already gone by the time Close is called and we need to cleanup
-	// here.
-	case StateInitial, StateBound, StateSynRecv:
-		e.cleanupLocked()
-		e.setEndpointState(StateClose)
-	case StateError, StateClose:
-		// do nothing.
-	default:
+	if e.workerRunning {
 		e.workerCleanup = true
 		tcpip.AddDanglingEndpoint(e)
 		// Worker will remove the dangling endpoint when the endpoint
 		// goroutine terminates.
 		e.notifyProtocolGoroutine(notifyClose)
+	} else {
+		e.transitionToStateCloseLocked()
 	}
 }
 
@@ -1010,13 +1007,18 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
 		e.acceptMu.Unlock()
 		return
 	}
-
 	close(e.acceptedChan)
+	ch := e.acceptedChan
 	e.acceptedChan = nil
 	e.acceptCond.Broadcast()
 	e.acceptMu.Unlock()
 
-	// Wait for all pending endpoints to close.
+	// Reset all connections that are waiting to be accepted.
+	for n := range ch {
+		n.notifyProtocolGoroutine(notifyReset)
+	}
+	// Wait for reset of all endpoints that are still waiting to be delivered to
+	// the now closed acceptedChan.
 	e.pendingAccepted.Wait()
 }
 
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 41caa9ed4..a9f121c17 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -1068,6 +1068,43 @@ func TestListenShutdown(t *testing.T) {
 	c.CheckNoPacket("Packet received when listening socket was shutdown")
 }
 
+// TestListenCloseWhileConnect tests for the listening endpoint to
+// drain the accept-queue when closed. This should reset all of the
+// pending connections that are waiting to be accepted.
+func TestListenCloseWhileConnect(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1 /* epRcvBuf */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(1 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventIn)
+	defer c.WQ.EventUnregister(&waitEntry)
+
+	executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+	// Wait for the new endpoint created because of handshake to be delivered
+	// to the listening endpoint's accept queue.
+	<-notifyCh
+
+	// Close the listening endpoint.
+	c.EP.Close()
+
+	// Expect the listening endpoint to reset the connection.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+		))
+}
+
 func TestTOSV4(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 308590162..1274d9f60 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -43,8 +43,6 @@ packetimpact_go_test(
 packetimpact_go_test(
     name = "tcp_noaccept_close_rst",
     srcs = ["tcp_noaccept_close_rst_test.go"],
-    # TODO(b/153380909): Fix netstack then remove the line below.
-    netstack = False,
     deps = [
         "//pkg/tcpip/header",
         "//test/packetimpact/testbench",
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 71bd7c14d..cd84e633a 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -365,6 +365,68 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   }
 }
 
+TEST_P(SocketInetLoopbackTest, TCPListenCloseWhileConnect) {
+  auto const& param = GetParam();
+
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  constexpr int kBacklog = 2;
+  constexpr int kClients = kBacklog + 1;
+
+  // Create the listening socket.
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  std::vector<FileDescriptor> clients;
+  for (int i = 0; i < kClients; i++) {
+    FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+    int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                      connector.addr_len);
+    if (ret != 0) {
+      EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
+      clients.push_back(std::move(client));
+    }
+  }
+  // Close the listening socket.
+  listen_fd.reset();
+
+  for (auto& client : clients) {
+    const int kTimeout = 10000;
+    struct pollfd pfd = {
+        .fd = client.get(),
+        .events = POLLIN,
+    };
+    // When the listening socket is closed, then we expect the remote to reset
+    // the connection.
+    ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+    ASSERT_EQ(pfd.revents, POLLIN | POLLHUP | POLLERR);
+    char c;
+    // Subsequent read can fail with:
+    // ECONNRESET: If the client connection was established and was reset by the
+    // remote. ECONNREFUSED: If the client connection failed to be established.
+    ASSERT_THAT(read(client.get(), &c, sizeof(c)),
+                AnyOf(SyscallFailsWithErrno(ECONNRESET),
+                      SyscallFailsWithErrno(ECONNREFUSED)));
+  }
+}
+
 TEST_P(SocketInetLoopbackTest, TCPbacklog) {
   auto const& param = GetParam();
 
-- 
cgit v1.2.3


From ea5b8e9633cd2731bb5656dea523beaf3d643472 Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Wed, 15 Apr 2020 14:30:20 -0700
Subject: Use if_nametoindex to get interface index.

Removed the TODO to use netlink.

PiperOrigin-RevId: 306721468
---
 test/syscalls/linux/ip_socket_test_util.cc | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index d28dc0db6..98d07ae85 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -16,7 +16,6 @@
 
 #include <net/if.h>
 #include <netinet/in.h>
-#include <sys/ioctl.h>
 #include <sys/socket.h>
 
 #include <cstring>
@@ -35,12 +34,11 @@ uint16_t PortFromInetSockaddr(const struct sockaddr* addr) {
 }
 
 PosixErrorOr<int> InterfaceIndex(std::string name) {
-  // TODO(igudger): Consider using netlink.
-  ifreq req = {};
-  memcpy(req.ifr_name, name.c_str(), name.size());
-  ASSIGN_OR_RETURN_ERRNO(auto sock, Socket(AF_INET, SOCK_DGRAM, 0));
-  RETURN_ERROR_IF_SYSCALL_FAIL(ioctl(sock.get(), SIOCGIFINDEX, &req));
-  return req.ifr_ifindex;
+  int index = if_nametoindex(name.c_str());
+  if (index) {
+    return index;
+  }
+  return PosixError(errno);
 }
 
 namespace {
-- 
cgit v1.2.3


From 3b05f576d73be644daa17203d9ed64481c45b4a8 Mon Sep 17 00:00:00 2001
From: Mithun Iyer <iyerm@google.com>
Date: Thu, 16 Apr 2020 17:57:06 -0700
Subject: Reset pending connections on listener shutdown.

When the listening socket is shut down for reading, we need to reset all
pending and incoming connections. Ensure that the endpoint is not removed from
the demuxer, so that a subsequent bind to the same port fails.

PiperOrigin-RevId: 306958038
---
 pkg/tcpip/transport/tcp/accept.go           | 20 +++---
 pkg/tcpip/transport/tcp/connect.go          |  7 ++-
 pkg/tcpip/transport/tcp/endpoint.go         | 30 ++++++---
 pkg/tcpip/transport/tcp/forwarder.go        |  2 +-
 pkg/tcpip/transport/tcp/protocol.go         |  8 +--
 pkg/tcpip/transport/tcp/tcp_test.go         | 16 ++---
 test/syscalls/linux/socket_inet_loopback.cc | 94 +++++++++++++++++++++++++++--
 7 files changed, 138 insertions(+), 39 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index b61c2a8c3..5bb243e3b 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -26,7 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -433,19 +432,16 @@ func (e *endpoint) acceptQueueIsFull() bool {
 // handleListenSegment is called when a listening endpoint receives a segment
 // and needs to handle it.
 func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
-	if s.flagsAreSet(header.TCPFlagSyn | header.TCPFlagAck) {
+	e.rcvListMu.Lock()
+	rcvClosed := e.rcvClosed
+	e.rcvListMu.Unlock()
+	if rcvClosed || s.flagsAreSet(header.TCPFlagSyn|header.TCPFlagAck) {
+		// If the endpoint is shutdown, reply with reset.
+		//
 		// RFC 793 section 3.4 page 35 (figure 12) outlines that a RST
 		// must be sent in response to a SYN-ACK while in the listen
 		// state to prevent completing a handshake from an old SYN.
-		e.sendTCP(&s.route, tcpFields{
-			id:     s.id,
-			ttl:    e.ttl,
-			tos:    e.sendTOS,
-			flags:  header.TCPFlagRst,
-			seq:    s.ackNumber,
-			ack:    0,
-			rcvWnd: 0,
-		}, buffer.VectorisedView{}, nil)
+		replyWithReset(s, e.sendTOS, e.ttl)
 		return
 	}
 
@@ -534,7 +530,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 			// The only time we should reach here when a connection
 			// was opened and closed really quickly and a delayed
 			// ACK was received from the sender.
-			replyWithReset(s)
+			replyWithReset(s, e.sendTOS, e.ttl)
 			return
 		}
 
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 994ac52a3..368865911 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -1053,10 +1053,15 @@ func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 		ep = e.stack.FindTransportEndpoint(header.IPv4ProtocolNumber, e.TransProto, e.ID, &s.route)
 	}
 	if ep == nil {
-		replyWithReset(s)
+		replyWithReset(s, stack.DefaultTOS, s.route.DefaultTTL())
 		s.decRef()
 		return
 	}
+
+	if e == ep {
+		panic("current endpoint not removed from demuxer, enqueing segments to itself")
+	}
+
 	if ep.(*endpoint).enqueueSegment(s) {
 		ep.(*endpoint).newSegmentWaker.Assert()
 	}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index bffc59e9f..5d0ea9e93 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2101,7 +2101,7 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
 	switch {
 	case e.EndpointState().connected():
 		// Close for read.
-		if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
+		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
 			// Mark read side as closed.
 			e.rcvListMu.Lock()
 			e.rcvClosed = true
@@ -2110,7 +2110,7 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
 
 			// If we're fully closed and we have unread data we need to abort
 			// the connection with a RST.
-			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
+			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
 				e.resetConnectionLocked(tcpip.ErrConnectionAborted)
 				// Wake up worker to terminate loop.
 				e.notifyProtocolGoroutine(notifyTickleWorker)
@@ -2119,7 +2119,7 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
 		}
 
 		// Close for write.
-		if (e.shutdownFlags & tcpip.ShutdownWrite) != 0 {
+		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
 			e.sndBufMu.Lock()
 			if e.sndClosed {
 				// Already closed.
@@ -2142,12 +2142,23 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
 
 		return nil
 	case e.EndpointState() == StateListen:
-		// Tell protocolListenLoop to stop.
-		if flags&tcpip.ShutdownRead != 0 {
-			e.notifyProtocolGoroutine(notifyClose)
+		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
+			// Reset all connections from the accept queue and keep the
+			// worker running so that it can continue handling incoming
+			// segments by replying with RST.
+			//
+			// By not removing this endpoint from the demuxer mapping, we
+			// ensure that any other bind to the same port fails, as on Linux.
+			// TODO(gvisor.dev/issue/2468): We need to enable applications to
+			// start listening on this endpoint again similar to Linux.
+			e.rcvListMu.Lock()
+			e.rcvClosed = true
+			e.rcvListMu.Unlock()
+			e.closePendingAcceptableConnectionsLocked()
+			// Notify waiters that the endpoint is shutdown.
+			e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut | waiter.EventHUp | waiter.EventErr)
 		}
 		return nil
-
 	default:
 		return tcpip.ErrNotConnected
 	}
@@ -2251,8 +2262,11 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	e.LockUser()
 	defer e.UnlockUser()
 
+	e.rcvListMu.Lock()
+	rcvClosed := e.rcvClosed
+	e.rcvListMu.Unlock()
 	// Endpoint must be in listen state before it can accept connections.
-	if e.EndpointState() != StateListen {
+	if rcvClosed || e.EndpointState() != StateListen {
 		return nil, nil, tcpip.ErrInvalidEndpointState
 	}
 
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index 808410c92..704d01c64 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -130,7 +130,7 @@ func (r *ForwarderRequest) Complete(sendReset bool) {
 
 	// If the caller requested, send a reset.
 	if sendReset {
-		replyWithReset(r.segment)
+		replyWithReset(r.segment, stack.DefaultTOS, r.segment.route.DefaultTTL())
 	}
 
 	// Release all resources.
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index effbf203f..cfd9a4e8e 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -223,12 +223,12 @@ func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Transpo
 		return true
 	}
 
-	replyWithReset(s)
+	replyWithReset(s, stack.DefaultTOS, s.route.DefaultTTL())
 	return true
 }
 
 // replyWithReset replies to the given segment with a reset segment.
-func replyWithReset(s *segment) {
+func replyWithReset(s *segment, tos, ttl uint8) {
 	// Get the seqnum from the packet if the ack flag is set.
 	seq := seqnum.Value(0)
 	ack := seqnum.Value(0)
@@ -252,8 +252,8 @@ func replyWithReset(s *segment) {
 	}
 	sendTCP(&s.route, tcpFields{
 		id:     s.id,
-		ttl:    s.route.DefaultTTL(),
-		tos:    stack.DefaultTOS,
+		ttl:    ttl,
+		tos:    tos,
 		flags:  flags,
 		seq:    seq,
 		ack:    ack,
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 74fb6e064..ab1014c7f 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -1034,8 +1034,8 @@ func TestSendRstOnListenerRxAckV6(t *testing.T) {
 		checker.SeqNum(200)))
 }
 
-// TestListenShutdown tests for the listening endpoint not processing
-// any receive when it is on read shutdown.
+// TestListenShutdown tests for the listening endpoint replying with RST
+// on read shutdown.
 func TestListenShutdown(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
@@ -1046,7 +1046,7 @@ func TestListenShutdown(t *testing.T) {
 		t.Fatal("Bind failed:", err)
 	}
 
-	if err := c.EP.Listen(10 /* backlog */); err != nil {
+	if err := c.EP.Listen(1 /* backlog */); err != nil {
 		t.Fatal("Listen failed:", err)
 	}
 
@@ -1054,9 +1054,6 @@ func TestListenShutdown(t *testing.T) {
 		t.Fatal("Shutdown failed:", err)
 	}
 
-	// Wait for the endpoint state to be propagated.
-	time.Sleep(10 * time.Millisecond)
-
 	c.SendPacket(nil, &context.Headers{
 		SrcPort: context.TestPort,
 		DstPort: context.StackPort,
@@ -1065,7 +1062,12 @@ func TestListenShutdown(t *testing.T) {
 		AckNum:  200,
 	})
 
-	c.CheckNoPacket("Packet received when listening socket was shutdown")
+	// Expect the listening endpoint to reset the connection.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+		))
 }
 
 // TestListenCloseWhileConnect tests for the listening endpoint to
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index cd84e633a..d3000dbc6 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -319,6 +319,75 @@ TEST_P(SocketInetLoopbackTest, TCPListenUnbound) {
   tcpSimpleConnectTest(listener, connector, false);
 }
 
+TEST_P(SocketInetLoopbackTest, TCPListenShutdown) {
+  auto const& param = GetParam();
+
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  constexpr int kBacklog = 2;
+  constexpr int kFDs = kBacklog + 1;
+
+  // Create the listening socket.
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+
+  // Shutdown the write of the listener, expect to not have any effect.
+  ASSERT_THAT(shutdown(listen_fd.get(), SHUT_WR), SyscallSucceeds());
+
+  std::vector<FileDescriptor> accepted;  // Owns the accepted sockets.
+  for (int i = 0; i < kFDs; i++) {
+    auto client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    ASSERT_THAT(connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                        connector.addr_len),
+                SyscallSucceeds());
+    int accepted_fd = accept(listen_fd.get(), nullptr, nullptr);
+    ASSERT_THAT(accepted_fd, SyscallSucceeds());
+    accepted.emplace_back(accepted_fd);
+  }
+
+  // Shutdown the read of the listener, expect to fail subsequent
+  // server accepts, binds and client connects.
+  ASSERT_THAT(shutdown(listen_fd.get(), SHUT_RD), SyscallSucceeds());
+
+  ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+              SyscallFailsWithErrno(EINVAL));
+
+  // Check that shutdown did not release the port.
+  FileDescriptor new_listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  ASSERT_THAT(
+      bind(new_listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+           listener.addr_len),
+      SyscallFailsWithErrno(EADDRINUSE));
+
+  // Check that subsequent connection attempts receive a RST.
+  for (int i = 0; i < kFDs; i++) {
+    auto client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    ASSERT_THAT(connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                        connector.addr_len),
+                SyscallFailsWithErrno(ECONNREFUSED));
+  }
+}
+
 TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   auto const& param = GetParam();
 
@@ -365,9 +434,8 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
   }
 }
 
-TEST_P(SocketInetLoopbackTest, TCPListenCloseWhileConnect) {
-  auto const& param = GetParam();
-
+void TestListenWhileConnect(const TestParam& param,
+                            void (*stopListen)(FileDescriptor&)) {
   TestAddress const& listener = param.listener;
   TestAddress const& connector = param.connector;
 
@@ -404,8 +472,8 @@ TEST_P(SocketInetLoopbackTest, TCPListenCloseWhileConnect) {
       clients.push_back(std::move(client));
     }
   }
-  // Close the listening socket.
-  listen_fd.reset();
+
+  stopListen(listen_fd);
 
   for (auto& client : clients) {
     const int kTimeout = 10000;
@@ -420,13 +488,26 @@ TEST_P(SocketInetLoopbackTest, TCPListenCloseWhileConnect) {
     char c;
     // Subsequent read can fail with:
     // ECONNRESET: If the client connection was established and was reset by the
-    // remote. ECONNREFUSED: If the client connection failed to be established.
+    // remote.
+    // ECONNREFUSED: If the client connection failed to be established.
     ASSERT_THAT(read(client.get(), &c, sizeof(c)),
                 AnyOf(SyscallFailsWithErrno(ECONNRESET),
                       SyscallFailsWithErrno(ECONNREFUSED)));
   }
 }
 
+TEST_P(SocketInetLoopbackTest, TCPListenCloseWhileConnect) {
+  TestListenWhileConnect(GetParam(), [](FileDescriptor& f) {
+    ASSERT_THAT(close(f.release()), SyscallSucceeds());
+  });
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenShutdownWhileConnect) {
+  TestListenWhileConnect(GetParam(), [](FileDescriptor& f) {
+    ASSERT_THAT(shutdown(f.get(), SHUT_RD), SyscallSucceeds());
+  });
+}
+
 TEST_P(SocketInetLoopbackTest, TCPbacklog) {
   auto const& param = GetParam();
 
@@ -1134,6 +1215,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
               if (connects_received >= kConnectAttempts) {
                 // Another thread have shutdown our read side causing the
                 // accept to fail.
+                ASSERT_EQ(errno, EINVAL);
                 break;
               }
               ASSERT_NO_ERRNO(fd);
-- 
cgit v1.2.3


From f03996c5e9803934226e4b3a10827501cb936ab9 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Thu, 16 Apr 2020 19:26:02 -0700
Subject: Implement pipe(2) and pipe2(2) for VFS2.

Updates #1035

PiperOrigin-RevId: 306968644
---
 pkg/sentry/fsimpl/pipefs/BUILD                     |  20 +++
 pkg/sentry/fsimpl/pipefs/pipefs.go                 | 148 +++++++++++++++++++
 pkg/sentry/fsimpl/tmpfs/filesystem.go              |   2 +-
 pkg/sentry/fsimpl/tmpfs/named_pipe.go              |  23 +--
 pkg/sentry/fsimpl/tmpfs/tmpfs.go                   |   2 +-
 pkg/sentry/kernel/BUILD                            |   1 +
 pkg/sentry/kernel/kernel.go                        |  30 +++-
 pkg/sentry/kernel/pipe/vfs.go                      | 162 ++++++++++++---------
 pkg/sentry/syscalls/linux/sys_pipe.go              |  14 +-
 pkg/sentry/syscalls/linux/vfs2/BUILD               |   3 +
 pkg/sentry/syscalls/linux/vfs2/fd.go               |  17 +++
 .../syscalls/linux/vfs2/linux64_override_amd64.go  |   4 +-
 pkg/sentry/syscalls/linux/vfs2/pipe.go             |  63 ++++++++
 pkg/sentry/syscalls/linux/vfs2/read_write.go       |   8 +-
 pkg/sentry/vfs/vfs.go                              |   2 +-
 test/syscalls/linux/pipe.cc                        |   2 +
 16 files changed, 389 insertions(+), 112 deletions(-)
 create mode 100644 pkg/sentry/fsimpl/pipefs/BUILD
 create mode 100644 pkg/sentry/fsimpl/pipefs/pipefs.go
 create mode 100644 pkg/sentry/syscalls/linux/vfs2/pipe.go

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD
new file mode 100644
index 000000000..0d411606f
--- /dev/null
+++ b/pkg/sentry/fsimpl/pipefs/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "pipefs",
+    srcs = ["pipefs.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
new file mode 100644
index 000000000..faf3179bc
--- /dev/null
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -0,0 +1,148 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pipefs provides the filesystem implementation backing
+// Kernel.PipeMount.
+package pipefs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+type filesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name.
+func (filesystemType) Name() string {
+	return "pipefs"
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	panic("pipefs.filesystemType.GetFilesystem should never be called")
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	kernfs.Filesystem
+
+	// TODO(gvisor.dev/issue/1193):
+	//
+	// - kernfs does not provide a way to implement statfs, from which we
+	// should indicate PIPEFS_MAGIC.
+	//
+	// - kernfs does not provide a way to override names for
+	// vfs.FilesystemImpl.PrependPath(); pipefs inodes should use synthetic
+	// name fmt.Sprintf("pipe:[%d]", inode.ino).
+}
+
+// NewFilesystem sets up and returns a new vfs.Filesystem implemented by
+// pipefs.
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem {
+	fs := &filesystem{}
+	fs.Init(vfsObj, filesystemType{})
+	return fs.VFSFilesystem()
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+	kernfs.InodeNoopRefCount
+
+	pipe *pipe.VFSPipe
+
+	ino uint64
+	uid auth.KUID
+	gid auth.KGID
+	// We use the creation timestamp for all of atime, mtime, and ctime.
+	ctime ktime.Time
+}
+
+func newInode(ctx context.Context, fs *kernfs.Filesystem) *inode {
+	creds := auth.CredentialsFromContext(ctx)
+	return &inode{
+		pipe:  pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+		ino:   fs.NextIno(),
+		uid:   creds.EffectiveKUID,
+		gid:   creds.EffectiveKGID,
+		ctime: ktime.NowFromContext(ctx),
+	}
+}
+
+const pipeMode = 0600 | linux.S_IFIFO
+
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
+func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, pipeMode, i.uid, i.gid)
+}
+
+// Mode implements kernfs.Inode.Mode.
+func (i *inode) Mode() linux.FileMode {
+	return pipeMode
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds())
+	return linux.Statx{
+		Mask:    linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
+		Blksize: usermem.PageSize,
+		Nlink:   1,
+		UID:     uint32(i.uid),
+		GID:     uint32(i.gid),
+		Mode:    pipeMode,
+		Ino:     i.ino,
+		Size:    0,
+		Blocks:  0,
+		Atime:   ts,
+		Ctime:   ts,
+		Mtime:   ts,
+		// TODO(gvisor.dev/issue/1197): Device number.
+	}, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat.
+func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	return syserror.EPERM
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// FIXME(b/38173783): kernfs does not plumb Context here.
+	return i.pipe.Open(context.Background(), rp.Mount(), vfsd, opts.Flags)
+}
+
+// NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
+// and write ends of a newly-created pipe, as for pipe(2) and pipe2(2).
+//
+// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem().
+func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription) {
+	fs := mnt.Filesystem().Impl().(*kernfs.Filesystem)
+	inode := newInode(ctx, fs)
+	var d kernfs.Dentry
+	d.Init(inode)
+	defer d.DecRef()
+	return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags)
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index f4d50d64f..660f5a29b 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -392,7 +392,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 		// Can't open symlinks without O_PATH (which is unimplemented).
 		return nil, syserror.ELOOP
 	case *namedPipe:
-		return newNamedPipeFD(ctx, impl, rp, &d.vfsd, opts.Flags)
+		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags)
 	case *deviceFile:
 		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
 	case *socketFile:
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 2c5c739df..8d77b3fa8 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -16,10 +16,8 @@ package tmpfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
@@ -33,27 +31,8 @@ type namedPipe struct {
 //   * fs.mu must be locked.
 //   * rp.Mount().CheckBeginWrite() has been called successfully.
 func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
-	file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)}
+	file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
 	file.inode.init(file, fs, creds, linux.S_IFIFO|mode)
 	file.inode.nlink = 1 // Only the parent has a link.
 	return &file.inode
 }
-
-// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented
-// entirely via struct embedding.
-type namedPipeFD struct {
-	fileDescription
-
-	*pipe.VFSPipeFD
-}
-
-func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
-	var err error
-	var fd namedPipeFD
-	fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, vfsd, &fd.vfsfd, flags)
-	if err != nil {
-		return nil, err
-	}
-	fd.vfsfd.Init(&fd, flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{})
-	return &fd.vfsfd, nil
-}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 9fa8637d5..a59b24d45 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -357,6 +357,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 		return err
 	}
 	i.mu.Lock()
+	defer i.mu.Unlock()
 	var (
 		needsMtimeBump bool
 		needsCtimeBump bool
@@ -427,7 +428,6 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 		atomic.StoreInt64(&i.ctime, now)
 	}
 
-	i.mu.Unlock()
 	return nil
 }
 
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index e0ff58d8c..e47af66d6 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -170,6 +170,7 @@ go_library(
         "//pkg/sentry/fs/timerfd",
         "//pkg/sentry/fsbridge",
         "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/fsimpl/pipefs",
         "//pkg/sentry/fsimpl/sockfs",
         "//pkg/sentry/hostcpu",
         "//pkg/sentry/inet",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index de8a95854..fef60e636 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -50,6 +50,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
 	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
@@ -254,6 +255,10 @@ type Kernel struct {
 	// VFS keeps the filesystem state used across the kernel.
 	vfs vfs.VirtualFilesystem
 
+	// pipeMount is the Mount used for pipes created by the pipe() and pipe2()
+	// syscalls (as opposed to named pipes created by mknod()).
+	pipeMount *vfs.Mount
+
 	// If set to true, report address space activation waits as if the task is in
 	// external wait so that the watchdog doesn't report the task stuck.
 	SleepForAddressSpaceActivation bool
@@ -354,19 +359,29 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
 	k.futexes = futex.NewManager()
 	k.netlinkPorts = port.New()
+
 	if VFS2Enabled {
 		if err := k.vfs.Init(); err != nil {
 			return fmt.Errorf("failed to initialize VFS: %v", err)
 		}
-		fs := sockfs.NewFilesystem(&k.vfs)
-		// NewDisconnectedMount will take an additional reference on fs.
-		defer fs.DecRef()
-		sm, err := k.vfs.NewDisconnectedMount(fs, nil, &vfs.MountOptions{})
+
+		pipeFilesystem := pipefs.NewFilesystem(&k.vfs)
+		defer pipeFilesystem.DecRef()
+		pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
+		if err != nil {
+			return fmt.Errorf("failed to create pipefs mount: %v", err)
+		}
+		k.pipeMount = pipeMount
+
+		socketFilesystem := sockfs.NewFilesystem(&k.vfs)
+		defer socketFilesystem.DecRef()
+		socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})
 		if err != nil {
 			return fmt.Errorf("failed to initialize socket mount: %v", err)
 		}
-		k.socketMount = sm
+		k.socketMount = socketMount
 	}
+
 	return nil
 }
 
@@ -1613,3 +1628,8 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
 func (k *Kernel) VFS() *vfs.VirtualFilesystem {
 	return &k.vfs
 }
+
+// PipeMount returns the pipefs mount.
+func (k *Kernel) PipeMount() *vfs.Mount {
+	return k.pipeMount
+}
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index a5675bd70..b54f08a30 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -49,38 +49,42 @@ type VFSPipe struct {
 }
 
 // NewVFSPipe returns an initialized VFSPipe.
-func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe {
+func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe {
 	var vp VFSPipe
-	initPipe(&vp.pipe, true /* isNamed */, sizeBytes, atomicIOBytes)
+	initPipe(&vp.pipe, isNamed, sizeBytes, atomicIOBytes)
 	return &vp
 }
 
-// NewVFSPipeFD opens a named pipe. Named pipes have special blocking semantics
-// during open:
+// ReaderWriterPair returns read-only and write-only FDs for vp.
 //
-// "Normally, opening the FIFO blocks until the other end is opened also. A
-// process can open a FIFO in nonblocking mode. In this case, opening for
-// read-only will succeed even if no-one has opened on the write side yet,
-// opening for write-only will fail with ENXIO (no such device or address)
-// unless the other end has already been opened. Under Linux, opening a FIFO
-// for read and write will succeed both in blocking and nonblocking mode. POSIX
-// leaves this behavior undefined. This can be used to open a FIFO for writing
-// while there are no readers available." - fifo(7)
-func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
+// Preconditions: statusFlags should not contain an open access mode.
+func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) {
+	return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags)
+}
+
+// Open opens the pipe represented by vp.
+func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) {
 	vp.mu.Lock()
 	defer vp.mu.Unlock()
 
-	readable := vfs.MayReadFileWithOpenFlags(flags)
-	writable := vfs.MayWriteFileWithOpenFlags(flags)
+	readable := vfs.MayReadFileWithOpenFlags(statusFlags)
+	writable := vfs.MayWriteFileWithOpenFlags(statusFlags)
 	if !readable && !writable {
 		return nil, syserror.EINVAL
 	}
 
-	vfd, err := vp.open(vfsd, vfsfd, flags)
-	if err != nil {
-		return nil, err
-	}
+	fd := vp.newFD(mnt, vfsd, statusFlags)
 
+	// Named pipes have special blocking semantics during open:
+	//
+	// "Normally, opening the FIFO blocks until the other end is opened also. A
+	// process can open a FIFO in nonblocking mode. In this case, opening for
+	// read-only will succeed even if no-one has opened on the write side yet,
+	// opening for write-only will fail with ENXIO (no such device or address)
+	// unless the other end has already been opened. Under Linux, opening a
+	// FIFO for read and write will succeed both in blocking and nonblocking
+	// mode. POSIX leaves this behavior undefined. This can be used to open a
+	// FIFO for writing while there are no readers available." - fifo(7)
 	switch {
 	case readable && writable:
 		// Pipes opened for read-write always succeed without blocking.
@@ -89,23 +93,26 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf
 
 	case readable:
 		newHandleLocked(&vp.rWakeup)
-		// If this pipe is being opened as nonblocking and there's no
+		// If this pipe is being opened as blocking and there's no
 		// writer, we have to wait for a writer to open the other end.
-		if flags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
+		if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
+			fd.DecRef()
 			return nil, syserror.EINTR
 		}
 
 	case writable:
 		newHandleLocked(&vp.wWakeup)
 
-		if !vp.pipe.HasReaders() {
-			// Nonblocking, write-only opens fail with ENXIO when
-			// the read side isn't open yet.
-			if flags&linux.O_NONBLOCK != 0 {
+		if vp.pipe.isNamed && !vp.pipe.HasReaders() {
+			// Non-blocking, write-only opens fail with ENXIO when the read
+			// side isn't open yet.
+			if statusFlags&linux.O_NONBLOCK != 0 {
+				fd.DecRef()
 				return nil, syserror.ENXIO
 			}
 			// Wait for a reader to open the other end.
 			if !waitFor(&vp.mu, &vp.rWakeup, ctx) {
+				fd.DecRef()
 				return nil, syserror.EINTR
 			}
 		}
@@ -114,96 +121,93 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf
 		panic("invalid pipe flags: must be readable, writable, or both")
 	}
 
-	return vfd, nil
+	return fd, nil
 }
 
 // Preconditions: vp.mu must be held.
-func (vp *VFSPipe) open(vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) {
-	var fd VFSPipeFD
-	fd.flags = flags
-	fd.readable = vfs.MayReadFileWithOpenFlags(flags)
-	fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
-	fd.vfsfd = vfsfd
-	fd.pipe = &vp.pipe
+func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription {
+	fd := &VFSPipeFD{
+		pipe: &vp.pipe,
+	}
+	fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		DenyPRead:         true,
+		DenyPWrite:        true,
+		UseDentryMetadata: true,
+	})
 
 	switch {
-	case fd.readable && fd.writable:
+	case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable():
 		vp.pipe.rOpen()
 		vp.pipe.wOpen()
-	case fd.readable:
+	case fd.vfsfd.IsReadable():
 		vp.pipe.rOpen()
-	case fd.writable:
+	case fd.vfsfd.IsWritable():
 		vp.pipe.wOpen()
 	default:
 		panic("invalid pipe flags: must be readable, writable, or both")
 	}
 
-	return &fd, nil
+	return &fd.vfsfd
 }
 
-// VFSPipeFD implements a subset of vfs.FileDescriptionImpl for pipes. It is
-// expected that filesystesm will use this in a struct implementing
-// vfs.FileDescriptionImpl.
+// VFSPipeFD implements vfs.FileDescriptionImpl for pipes.
 type VFSPipeFD struct {
-	pipe     *Pipe
-	flags    uint32
-	readable bool
-	writable bool
-	vfsfd    *vfs.FileDescription
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	pipe *Pipe
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
 func (fd *VFSPipeFD) Release() {
 	var event waiter.EventMask
-	if fd.readable {
+	if fd.vfsfd.IsReadable() {
 		fd.pipe.rClose()
-		event |= waiter.EventIn
+		event |= waiter.EventOut
 	}
-	if fd.writable {
+	if fd.vfsfd.IsWritable() {
 		fd.pipe.wClose()
-		event |= waiter.EventOut
+		event |= waiter.EventIn | waiter.EventHUp
 	}
 	if event == 0 {
 		panic("invalid pipe flags: must be readable, writable, or both")
 	}
 
-	if fd.writable {
-		fd.vfsfd.VirtualDentry().Mount().EndWrite()
-	}
-
 	fd.pipe.Notify(event)
 }
 
-// OnClose implements vfs.FileDescriptionImpl.OnClose.
-func (fd *VFSPipeFD) OnClose(_ context.Context) error {
-	return nil
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	switch {
+	case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable():
+		return fd.pipe.rwReadiness()
+	case fd.vfsfd.IsReadable():
+		return fd.pipe.rReadiness()
+	case fd.vfsfd.IsWritable():
+		return fd.pipe.wReadiness()
+	default:
+		panic("pipe FD is neither readable nor writable")
+	}
 }
 
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *VFSPipeFD) PRead(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.ReadOptions) (int64, error) {
-	return 0, syserror.ESPIPE
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fd.pipe.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *VFSPipeFD) EventUnregister(e *waiter.Entry) {
+	fd.pipe.EventUnregister(e)
 }
 
 // Read implements vfs.FileDescriptionImpl.Read.
 func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
-	if !fd.readable {
-		return 0, syserror.EINVAL
-	}
-
 	return fd.pipe.Read(ctx, dst)
 }
 
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *VFSPipeFD) PWrite(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.WriteOptions) (int64, error) {
-	return 0, syserror.ESPIPE
-}
-
 // Write implements vfs.FileDescriptionImpl.Write.
 func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
-	if !fd.writable {
-		return 0, syserror.EINVAL
-	}
-
 	return fd.pipe.Write(ctx, src)
 }
 
@@ -211,3 +215,17 @@ func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.Wr
 func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
 	return fd.pipe.Ioctl(ctx, uio, args)
 }
+
+// PipeSize implements fcntl(F_GETPIPE_SZ).
+func (fd *VFSPipeFD) PipeSize() int64 {
+	// Inline Pipe.FifoSize() rather than calling it with nil Context and
+	// fs.File and ignoring the returned error (which is always nil).
+	fd.pipe.mu.Lock()
+	defer fd.pipe.mu.Unlock()
+	return fd.pipe.max
+}
+
+// SetPipeSize implements fcntl(F_SETPIPE_SZ).
+func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
+	return fd.pipe.SetFifoSize(size)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
index 798344042..43c510930 100644
--- a/pkg/sentry/syscalls/linux/sys_pipe.go
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -24,6 +24,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// LINT.IfChange
+
 // pipe2 implements the actual system call with flags.
 func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
 	if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
@@ -45,10 +47,12 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
 	}
 
 	if _, err := t.CopyOut(addr, fds); err != nil {
-		// The files are not closed in this case, the exact semantics
-		// of this error case are not well defined, but they could have
-		// already been observed by user space.
-		return 0, syserror.EFAULT
+		for _, fd := range fds {
+			if file, _ := t.FDTable().Remove(fd); file != nil {
+				file.DecRef()
+			}
+		}
+		return 0, err
 	}
 	return 0, nil
 }
@@ -69,3 +73,5 @@ func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	n, err := pipe2(t, addr, flags)
 	return n, nil, err
 }
+
+// LINT.ThenChange(vfs2/pipe.go)
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index b32abfe59..6ff2d84d2 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -18,6 +18,7 @@ go_library(
         "linux64_override_arm64.go",
         "mmap.go",
         "path.go",
+        "pipe.go",
         "poll.go",
         "read_write.go",
         "setstat.go",
@@ -39,8 +40,10 @@ go_library(
         "//pkg/gohacks",
         "//pkg/sentry/arch",
         "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fsimpl/pipefs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/limits",
         "//pkg/sentry/loader",
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
index 3afcea665..8181d80f4 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -18,6 +18,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
 	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -140,6 +141,22 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		return uintptr(file.StatusFlags()), nil, nil
 	case linux.F_SETFL:
 		return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())
+	case linux.F_SETPIPE_SZ:
+		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
+		if !ok {
+			return 0, nil, syserror.EBADF
+		}
+		n, err := pipefile.SetPipeSize(int64(args[2].Int()))
+		if err != nil {
+			return 0, nil, err
+		}
+		return uintptr(n), nil, nil
+	case linux.F_GETPIPE_SZ:
+		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
+		if !ok {
+			return 0, nil, syserror.EBADF
+		}
+		return uintptr(pipefile.PipeSize()), nil, nil
 	default:
 		// TODO(gvisor.dev/issue/1623): Everything else is not yet supported.
 		return 0, nil, syserror.EINVAL
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
index 645e0bcb8..21eb98444 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -39,7 +39,7 @@ func Override(table map[uintptr]kernel.Syscall) {
 	table[19] = syscalls.Supported("readv", Readv)
 	table[20] = syscalls.Supported("writev", Writev)
 	table[21] = syscalls.Supported("access", Access)
-	delete(table, 22) // pipe
+	table[22] = syscalls.Supported("pipe", Pipe)
 	table[23] = syscalls.Supported("select", Select)
 	table[32] = syscalls.Supported("dup", Dup)
 	table[33] = syscalls.Supported("dup2", Dup2)
@@ -151,7 +151,7 @@ func Override(table map[uintptr]kernel.Syscall) {
 	delete(table, 290) // eventfd2
 	table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
 	table[292] = syscalls.Supported("dup3", Dup3)
-	delete(table, 293) // pipe2
+	table[293] = syscalls.Supported("pipe2", Pipe2)
 	delete(table, 294) // inotify_init1
 	table[295] = syscalls.Supported("preadv", Preadv)
 	table[296] = syscalls.Supported("pwritev", Pwritev)
diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go
new file mode 100644
index 000000000..4a01e4209
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go
@@ -0,0 +1,63 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Pipe implements Linux syscall pipe(2).
+func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	return 0, nil, pipe2(t, addr, 0)
+}
+
+// Pipe2 implements Linux syscall pipe2(2).
+func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Int()
+	return 0, nil, pipe2(t, addr, flags)
+}
+
+func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error {
+	if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
+		return syserror.EINVAL
+	}
+	r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK))
+	defer r.DecRef()
+	defer w.DecRef()
+
+	fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{
+		CloseOnExec: flags&linux.O_CLOEXEC != 0,
+	})
+	if err != nil {
+		return err
+	}
+	if _, err := t.CopyOut(addr, fds); err != nil {
+		for _, fd := range fds {
+			if _, file := t.FDTable().Remove(fd); file != nil {
+				file.DecRef()
+			}
+		}
+		return err
+	}
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
index 898b190fd..6c6998f45 100644
--- a/pkg/sentry/syscalls/linux/vfs2/read_write.go
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -103,7 +103,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt
 
 		// Issue the request and break out if it completes with anything other than
 		// "would block".
-		n, err := file.Read(t, dst, opts)
+		n, err = file.Read(t, dst, opts)
 		total += n
 		if err != syserror.ErrWouldBlock {
 			break
@@ -248,7 +248,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of
 
 		// Issue the request and break out if it completes with anything other than
 		// "would block".
-		n, err := file.PRead(t, dst, offset+total, opts)
+		n, err = file.PRead(t, dst, offset+total, opts)
 		total += n
 		if err != syserror.ErrWouldBlock {
 			break
@@ -335,7 +335,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op
 
 		// Issue the request and break out if it completes with anything other than
 		// "would block".
-		n, err := file.Write(t, src, opts)
+		n, err = file.Write(t, src, opts)
 		total += n
 		if err != syserror.ErrWouldBlock {
 			break
@@ -480,7 +480,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o
 
 		// Issue the request and break out if it completes with anything other than
 		// "would block".
-		n, err := file.PWrite(t, src, offset+total, opts)
+		n, err = file.PWrite(t, src, offset+total, opts)
 		total += n
 		if err != syserror.ErrWouldBlock {
 			break
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 053c6e1d1..cb5bbd781 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -335,7 +335,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
 	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
-		if err != nil {
+		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index d8e19e910..67228b66b 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -265,6 +265,8 @@ TEST_P(PipeTest, OffsetCalls) {
               SyscallFailsWithErrno(ESPIPE));
 
   struct iovec iov;
+  iov.iov_base = &buf;
+  iov.iov_len = sizeof(buf);
   EXPECT_THAT(preadv(wfd_.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE));
   EXPECT_THAT(pwritev(rfd_.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE));
 }
-- 
cgit v1.2.3


From 4a818d64378f16f3738ba51c7804cff90f753b1d Mon Sep 17 00:00:00 2001
From: Ting-Yu Wang <anivia@google.com>
Date: Fri, 17 Apr 2020 10:33:54 -0700
Subject: proc net test: Annotate disable-save test with NoRandomSave.

PiperOrigin-RevId: 307069884
---
 test/syscalls/linux/proc_net.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
index 4e23d1e78..cac394910 100644
--- a/test/syscalls/linux/proc_net.cc
+++ b/test/syscalls/linux/proc_net.cc
@@ -353,7 +353,7 @@ TEST(ProcNetSnmp, UdpNoPorts_NoRandomSave) {
   EXPECT_EQ(oldNoPorts, newNoPorts - 1);
 }
 
-TEST(ProcNetSnmp, UdpIn) {
+TEST(ProcNetSnmp, UdpIn_NoRandomSave) {
   // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
   const DisableSave ds;
 
-- 
cgit v1.2.3


From e838290e671c9d72dbaa3aba13bf0c35f1147de4 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Fri, 17 Apr 2020 15:31:51 -0700
Subject: prlimit: don't check credentials on self

prlimit was erroneously comparing UIDs and GIDs when getting/setting a process's
own limits. From the manpage:

To set or get the resources of a process other than itself, the caller must have
the CAP_SYS_RESOURCE capability, or the real, effective, and saved set user IDs
of the target process must match the real user ID of the caller and the real,
effective, and saved set group IDs of the target process must match the real
group ID of the caller.

PiperOrigin-RevId: 307127266
---
 pkg/sentry/syscalls/linux/sys_rlimit.go |  2 +-
 test/syscalls/linux/uidgid.cc           | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go
index e08c333d6..d5d5b6959 100644
--- a/pkg/sentry/syscalls/linux/sys_rlimit.go
+++ b/pkg/sentry/syscalls/linux/sys_rlimit.go
@@ -197,7 +197,7 @@ func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 	// saved set user IDs of the target process must match the real user ID of
 	// the caller and the real, effective, and saved set group IDs of the
 	// target process must match the real group ID of the caller."
-	if !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.PIDNamespace().UserNamespace()) {
+	if ot != t && !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.PIDNamespace().UserNamespace()) {
 		cred, tcred := t.Credentials(), ot.Credentials()
 		if cred.RealKUID != tcred.RealKUID ||
 			cred.RealKUID != tcred.EffectiveKUID ||
diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc
index 6218fbce1..ff66a79f4 100644
--- a/test/syscalls/linux/uidgid.cc
+++ b/test/syscalls/linux/uidgid.cc
@@ -14,6 +14,7 @@
 
 #include <errno.h>
 #include <grp.h>
+#include <sys/resource.h>
 #include <sys/types.h>
 #include <unistd.h>
 
@@ -249,6 +250,17 @@ TEST(UidGidRootTest, Setgroups) {
               SyscallFailsWithErrno(EFAULT));
 }
 
+TEST(UidGidRootTest, Setuid_prlimit) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+  // Change our UID.
+  EXPECT_THAT(seteuid(65534), SyscallSucceeds());
+
+  // Despite the UID change, we should be able to get our own limits.
+  struct rlimit rl = {};
+  ASSERT_THAT(prlimit(0, RLIMIT_NOFILE, NULL, &rl), SyscallSucceeds());
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3


From 639c8dd80870133f61465588e717b725417a0c41 Mon Sep 17 00:00:00 2001
From: Kevin Krakauer <krakauer@google.com>
Date: Tue, 21 Apr 2020 10:56:04 -0700
Subject: Restore euid upon test finish

PiperOrigin-RevId: 307638329
---
 test/syscalls/linux/uidgid.cc | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc
index ff66a79f4..64d6d0b8f 100644
--- a/test/syscalls/linux/uidgid.cc
+++ b/test/syscalls/linux/uidgid.cc
@@ -253,12 +253,21 @@ TEST(UidGidRootTest, Setgroups) {
 TEST(UidGidRootTest, Setuid_prlimit) {
   SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
 
-  // Change our UID.
-  EXPECT_THAT(seteuid(65534), SyscallSucceeds());
+  // Do seteuid in a separate thread so that after finishing this test, the
+  // process can still open files the test harness created before starting this
+  // test. Otherwise, the files are created by root (UID before the test), but
+  // cannot be opened by the `uid` set below after the test.
+  ScopedThread([&] {
+    // Use syscall instead of glibc setuid wrapper because we want this seteuid
+    // call to only apply to this task. POSIX threads, however, require that all
+    // threads have the same UIDs, so using the seteuid wrapper sets all
+    // threads' UID.
+    EXPECT_THAT(syscall(SYS_setreuid, -1, 65534), SyscallSucceeds());
 
-  // Despite the UID change, we should be able to get our own limits.
-  struct rlimit rl = {};
-  ASSERT_THAT(prlimit(0, RLIMIT_NOFILE, NULL, &rl), SyscallSucceeds());
+    // Despite the UID change, we should be able to get our own limits.
+    struct rlimit rl = {};
+    EXPECT_THAT(prlimit(0, RLIMIT_NOFILE, NULL, &rl), SyscallSucceeds());
+  });
 }
 
 }  // namespace
-- 
cgit v1.2.3


From 37f863f62813f76b05979494c1bc2fe102629321 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 22 Apr 2020 14:15:33 -0700
Subject: tcp: handle listen after shutdown properly

Right now, sentry panics when listen() is called again after shutdown():
panic: close of nil channel

goroutine 67 [running]:
pkg/tcpip/transport/tcp/tcp.(*endpoint).listen(0xc0000ce000, 0x9, 0x0)
        pkg/tcpip/transport/tcp/endpoint.go:2208 +0x170
pkg/tcpip/transport/tcp/tcp.(*endpoint).Listen(0xc0000ce000, 0x9, 0xc0003a1ad0)
        pkg/tcpip/transport/tcp/endpoint.go:2179 +0x50

Fixes #2468

PiperOrigin-RevId: 307896725
---
 pkg/tcpip/transport/tcp/endpoint.go         | 43 +++++++++++++++--------------
 pkg/tcpip/transport/tcp/endpoint_state.go   |  5 ++++
 test/syscalls/linux/socket_inet_loopback.cc | 43 +++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+), 20 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 45f2aa78b..07d3e64c8 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2158,8 +2158,6 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
 			//
 			// By not removing this endpoint from the demuxer mapping, we
 			// ensure that any other bind to the same port fails, as on Linux.
-			// TODO(gvisor.dev/issue/2468): We need to enable applications to
-			// start listening on this endpoint again similar to Linux.
 			e.rcvListMu.Lock()
 			e.rcvClosed = true
 			e.rcvListMu.Unlock()
@@ -2188,26 +2186,31 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	e.LockUser()
 	defer e.UnlockUser()
 
-	// Allow the backlog to be adjusted if the endpoint is not shutting down.
-	// When the endpoint shuts down, it sets workerCleanup to true, and from
-	// that point onward, acceptedChan is the responsibility of the cleanup()
-	// method (and should not be touched anywhere else, including here).
-	if e.EndpointState() == StateListen && !e.workerCleanup {
-		// Adjust the size of the channel iff we can fix existing
-		// pending connections into the new one.
+	if e.EndpointState() == StateListen && !e.closed {
 		e.acceptMu.Lock()
 		defer e.acceptMu.Unlock()
-		if len(e.acceptedChan) > backlog {
-			return tcpip.ErrInvalidEndpointState
-		}
-		if cap(e.acceptedChan) == backlog {
-			return nil
-		}
-		origChan := e.acceptedChan
-		e.acceptedChan = make(chan *endpoint, backlog)
-		close(origChan)
-		for ep := range origChan {
-			e.acceptedChan <- ep
+		if e.acceptedChan == nil {
+			// listen is called after shutdown.
+			e.acceptedChan = make(chan *endpoint, backlog)
+			e.shutdownFlags = 0
+			e.rcvListMu.Lock()
+			e.rcvClosed = false
+			e.rcvListMu.Unlock()
+		} else {
+			// Adjust the size of the channel iff we can fix
+			// existing pending connections into the new one.
+			if len(e.acceptedChan) > backlog {
+				return tcpip.ErrInvalidEndpointState
+			}
+			if cap(e.acceptedChan) == backlog {
+				return nil
+			}
+			origChan := e.acceptedChan
+			e.acceptedChan = make(chan *endpoint, backlog)
+			close(origChan)
+			for ep := range origChan {
+				e.acceptedChan <- ep
+			}
 		}
 
 		// Notify any blocked goroutines that they can attempt to
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index c3c692555..8b7562396 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -247,6 +247,11 @@ func (e *endpoint) Resume(s *stack.Stack) {
 			if err := e.Listen(backlog); err != nil {
 				panic("endpoint listening failed: " + err.String())
 			}
+			e.LockUser()
+			if e.shutdownFlags != 0 {
+				e.shutdownLocked(e.shutdownFlags)
+			}
+			e.UnlockUser()
 			listenLoading.Done()
 			tcpip.AsyncLoading.Done()
 		}()
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index d3000dbc6..9400ffaeb 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -319,6 +319,49 @@ TEST_P(SocketInetLoopbackTest, TCPListenUnbound) {
   tcpSimpleConnectTest(listener, connector, false);
 }
 
+TEST_P(SocketInetLoopbackTest, TCPListenShutdownListen) {
+  const auto& param = GetParam();
+
+  const TestAddress& listener = param.listener;
+  const TestAddress& connector = param.connector;
+
+  constexpr int kBacklog = 5;
+
+  // Create the listening socket.
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+  ASSERT_THAT(shutdown(listen_fd.get(), SHUT_RD), SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  const uint16_t port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+
+  for (int i = 0; i < kBacklog; i++) {
+    auto client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    ASSERT_THAT(connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                        connector.addr_len),
+                SyscallSucceeds());
+  }
+  for (int i = 0; i < kBacklog; i++) {
+    ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr), SyscallSucceeds());
+  }
+}
+
 TEST_P(SocketInetLoopbackTest, TCPListenShutdown) {
   auto const& param = GetParam();
 
-- 
cgit v1.2.3