From ebd25099bfb9ac6af9739dd9a7795aff13f8e34a Mon Sep 17 00:00:00 2001
From: Bin Lu <bin.lu@arm.com>
Date: Fri, 10 Jan 2020 16:45:45 +0800
Subject: enable //test/syscalls:proc_test support on Arm64

Problems with different platform architectures have been solved.

Signed-off-by: Bin Lu <bin.lu@arm.com>
---
 test/syscalls/linux/proc.cc | 70 +++++++++++++++++++++++++++++++--------------
 1 file changed, 48 insertions(+), 22 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 8cf08991b..66f89ef64 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -102,7 +102,55 @@ namespace {
 
 // O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
 // because "it isn't needed", even though Linux can return it via F_GETFL.
+#if defined(__x86_64__) || defined(__i386__)
 constexpr int kOLargeFile = 00100000;
+#elif __aarch64__
+// The value originate from the Linux
+// kernel's arch/arm64/include/uapi/asm/fcntl.h.
+constexpr int kOLargeFile = 00400000;
+#else
+#error "Unknown architecture"
+#endif
+
+#if defined(__x86_64__) || defined(__i386__)
+  // This list of "required" fields is taken from reading the file
+  // arch/x86/kernel/cpu/proc.c and seeing which fields will be unconditionally
+  // printed by the kernel.
+  static const char* required_fields[] = {
+      "processor",
+      "vendor_id",
+      "cpu family",
+      "model\t\t:",
+      "model name",
+      "stepping",
+      "cpu MHz",
+      "fpu\t\t:",
+      "fpu_exception",
+      "cpuid level",
+      "wp",
+      "bogomips",
+      "clflush size",
+      "cache_alignment",
+      "address sizes",
+      "power management",
+  };
+#elif __aarch64__
+  // This list of "required" fields is taken from reading the file
+  // arch/arm64/kernel/cpuinfo.c and seeing which fields will be unconditionally
+  // printed by the kernel.
+  static const char* required_fields[] = {
+      "processor",
+      "BogoMIPS",
+      "Features",
+      "CPU implementer",
+      "CPU architecture",
+      "CPU variant",
+      "CPU part",
+      "CPU revision",
+  };
+#else
+#error "Unknown architecture"
+#endif
 
 // Takes the subprocess command line and pid.
 // If it returns !OK, WithSubprocess returns immediately.
@@ -717,28 +765,6 @@ TEST(ProcCpuinfo, RequiredFieldsArePresent) {
   ASSERT_FALSE(proc_cpuinfo.empty());
   std::vector<std::string> cpuinfo_fields = absl::StrSplit(proc_cpuinfo, '\n');
 
-  // This list of "required" fields is taken from reading the file
-  // arch/x86/kernel/cpu/proc.c and seeing which fields will be unconditionally
-  // printed by the kernel.
-  static const char* required_fields[] = {
-      "processor",
-      "vendor_id",
-      "cpu family",
-      "model\t\t:",
-      "model name",
-      "stepping",
-      "cpu MHz",
-      "fpu\t\t:",
-      "fpu_exception",
-      "cpuid level",
-      "wp",
-      "bogomips",
-      "clflush size",
-      "cache_alignment",
-      "address sizes",
-      "power management",
-  };
-
   // Check that the usual fields are there. We don't really care about the
   // contents.
   for (const std::string& field : required_fields) {
-- 
cgit v1.2.3


From debd213da61cf35d7c91346820e93fc87bfa5896 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Mon, 13 Jan 2020 14:45:31 -0800
Subject: Allow dual stack sockets to operate on AF_INET

Fixes #1490
Fixes #1495

PiperOrigin-RevId: 289523250
---
 pkg/sentry/socket/netstack/netstack.go      |  65 +++++++++---
 pkg/sentry/socket/unix/unix.go              |   5 +-
 pkg/sentry/strace/socket.go                 |   2 +-
 pkg/tcpip/stack/stack.go                    |  43 ++++++++
 pkg/tcpip/transport/icmp/endpoint.go        |  22 ++--
 pkg/tcpip/transport/tcp/endpoint.go         |  23 +---
 pkg/tcpip/transport/udp/endpoint.go         |  41 ++------
 scripts/common.sh                           |   2 +-
 test/syscalls/linux/BUILD                   |   1 +
 test/syscalls/linux/socket_inet_loopback.cc | 156 ++++++++++++++++++++++++++++
 10 files changed, 278 insertions(+), 82 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 099319327..c020c11cb 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -324,22 +324,15 @@ func bytesToIPAddress(addr []byte) tcpip.Address {
 // converts it to the FullAddress format. It supports AF_UNIX, AF_INET,
 // AF_INET6, and AF_PACKET addresses.
 //
-// strict indicates whether addresses with the AF_UNSPEC family are accepted of not.
-//
 // AddressAndFamily returns an address and its family.
-func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, uint16, *syserr.Error) {
+func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
 	// Make sure we have at least 2 bytes for the address family.
 	if len(addr) < 2 {
 		return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument
 	}
 
-	family := usermem.ByteOrder.Uint16(addr)
-	if family != uint16(sfamily) && (strict || family != linux.AF_UNSPEC) {
-		return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported
-	}
-
 	// Get the rest of the fields based on the address family.
-	switch family {
+	switch family := usermem.ByteOrder.Uint16(addr); family {
 	case linux.AF_UNIX:
 		path := addr[2:]
 		if len(path) > linux.UnixPathMax {
@@ -638,10 +631,40 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
 	return r
 }
 
+func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error {
+	if family == uint16(s.family) {
+		return nil
+	}
+	if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 {
+		v, err := s.Endpoint.GetSockOptBool(tcpip.V6OnlyOption)
+		if err != nil {
+			return syserr.TranslateNetstackError(err)
+		}
+		if !v {
+			return nil
+		}
+	}
+	return syserr.ErrInvalidArgument
+}
+
+// mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the
+// receiver's family is AF_INET6.
+//
+// This is a hack to work around the fact that both IPv4 and IPv6 ANY are
+// represented by the empty string.
+//
+// TODO(gvisor.dev/issues/1556): remove this function.
+func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
+	if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
+		addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00"
+	}
+	return addr
+}
+
 // Connect implements the linux syscall connect(2) for sockets backed by
 // tpcip.Endpoint.
 func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
-	addr, family, err := AddressAndFamily(s.family, sockaddr, false /* strict */)
+	addr, family, err := AddressAndFamily(sockaddr)
 	if err != nil {
 		return err
 	}
@@ -653,6 +676,12 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 		}
 		return syserr.TranslateNetstackError(err)
 	}
+
+	if err := s.checkFamily(family, false /* exact */); err != nil {
+		return err
+	}
+	addr = s.mapFamily(addr, family)
+
 	// Always return right away in the non-blocking case.
 	if !blocking {
 		return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
@@ -681,10 +710,14 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
 // Bind implements the linux syscall bind(2) for sockets backed by
 // tcpip.Endpoint.
 func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
-	addr, _, err := AddressAndFamily(s.family, sockaddr, true /* strict */)
+	addr, family, err := AddressAndFamily(sockaddr)
 	if err != nil {
 		return err
 	}
+	if err := s.checkFamily(family, true /* exact */); err != nil {
+		return err
+	}
+	addr = s.mapFamily(addr, family)
 
 	// Issue the bind request to the endpoint.
 	return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
@@ -2080,8 +2113,8 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32)
 
 	case linux.AF_INET6:
 		var out linux.SockAddrInet6
-		if len(addr.Addr) == 4 {
-			// Copy address is v4-mapped format.
+		if len(addr.Addr) == header.IPv4AddressSize {
+			// Copy address in v4-mapped format.
 			copy(out.Addr[12:], addr.Addr)
 			out.Addr[10] = 0xff
 			out.Addr[11] = 0xff
@@ -2395,10 +2428,14 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
 
 	var addr *tcpip.FullAddress
 	if len(to) > 0 {
-		addrBuf, _, err := AddressAndFamily(s.family, to, true /* strict */)
+		addrBuf, family, err := AddressAndFamily(to)
 		if err != nil {
 			return 0, err
 		}
+		if err := s.checkFamily(family, false /* exact */); err != nil {
+			return 0, err
+		}
+		addrBuf = s.mapFamily(addrBuf, family)
 
 		addr = &addrBuf
 	}
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 91effe89a..7f49ba864 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -116,13 +116,16 @@ func (s *SocketOperations) Endpoint() transport.Endpoint {
 
 // extractPath extracts and validates the address.
 func extractPath(sockaddr []byte) (string, *syserr.Error) {
-	addr, _, err := netstack.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */)
+	addr, family, err := netstack.AddressAndFamily(sockaddr)
 	if err != nil {
 		if err == syserr.ErrAddressFamilyNotSupported {
 			err = syserr.ErrInvalidArgument
 		}
 		return "", err
 	}
+	if family != linux.AF_UNIX {
+		return "", syserr.ErrInvalidArgument
+	}
 
 	// The address is trimmed by GetAddress.
 	p := string(addr.Addr)
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index 51f2efb39..b6d7177f4 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -341,7 +341,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string {
 
 	switch family {
 	case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX:
-		fa, _, err := netstack.AddressAndFamily(int(family), b, true /* strict */)
+		fa, _, err := netstack.AddressAndFamily(b)
 		if err != nil {
 			return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err)
 		}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index a47ceba54..113b457fb 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -547,6 +547,49 @@ type TransportEndpointInfo struct {
 	RegisterNICID tcpip.NICID
 }
 
+// AddrNetProto unwraps the specified address if it is a V4-mapped V6 address
+// and returns the network protocol number to be used to communicate with the
+// specified address. It returns an error if the passed address is incompatible
+// with the receiver.
+func (e *TransportEndpointInfo) AddrNetProto(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	netProto := e.NetProto
+	switch len(addr.Addr) {
+	case header.IPv4AddressSize:
+		netProto = header.IPv4ProtocolNumber
+	case header.IPv6AddressSize:
+		if header.IsV4MappedAddress(addr.Addr) {
+			netProto = header.IPv4ProtocolNumber
+			addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
+			if addr.Addr == header.IPv4Any {
+				addr.Addr = ""
+			}
+		}
+	}
+
+	switch len(e.ID.LocalAddress) {
+	case header.IPv4AddressSize:
+		if len(addr.Addr) == header.IPv6AddressSize {
+			return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState
+		}
+	case header.IPv6AddressSize:
+		if len(addr.Addr) == header.IPv4AddressSize {
+			return tcpip.FullAddress{}, 0, tcpip.ErrNetworkUnreachable
+		}
+	}
+
+	switch {
+	case netProto == e.NetProto:
+	case netProto == header.IPv4ProtocolNumber && e.NetProto == header.IPv6ProtocolNumber:
+		if v6only {
+			return tcpip.FullAddress{}, 0, tcpip.ErrNoRoute
+		}
+	default:
+		return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState
+	}
+
+	return addr, netProto, nil
+}
+
 // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
 // marker interface.
 func (*TransportEndpointInfo) IsEndpointInfo() {}
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 330786f4c..42afb3f5b 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -288,7 +288,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 
 		toCopy := *to
 		to = &toCopy
-		netProto, err := e.checkV4Mapped(to, true)
+		netProto, err := e.checkV4Mapped(to)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -475,18 +475,12 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	})
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	netProto := e.NetProto
-	if header.IsV4MappedAddress(addr.Addr) {
-		return 0, tcpip.ErrNoRoute
-	}
-
-	// Fail if we're bound to an address length different from the one we're
-	// checking.
-	if l := len(e.ID.LocalAddress); !allowMismatch && l != 0 && l != len(addr.Addr) {
-		return 0, tcpip.ErrInvalidEndpointState
+func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, false /* v6only */)
+	if err != nil {
+		return 0, err
 	}
-
+	*addr = unwrapped
 	return netProto, nil
 }
 
@@ -518,7 +512,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr, false)
+	netProto, err := e.checkV4Mapped(&addr)
 	if err != nil {
 		return err
 	}
@@ -631,7 +625,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr, false)
+	netProto, err := e.checkV4Mapped(&addr)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index cca511fb9..cc8b533c8 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1691,26 +1691,11 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 }
 
 func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	netProto := e.NetProto
-	if header.IsV4MappedAddress(addr.Addr) {
-		// Fail if using a v4 mapped address on a v6only endpoint.
-		if e.v6only {
-			return 0, tcpip.ErrNoRoute
-		}
-
-		netProto = header.IPv4ProtocolNumber
-		addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
-		if addr.Addr == header.IPv4Any {
-			addr.Addr = ""
-		}
-	}
-
-	// Fail if we're bound to an address length different from the one we're
-	// checking.
-	if l := len(e.ID.LocalAddress); l != 0 && len(addr.Addr) != 0 && l != len(addr.Addr) {
-		return 0, tcpip.ErrInvalidEndpointState
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+	if err != nil {
+		return 0, err
 	}
-
+	*addr = unwrapped
 	return netProto, nil
 }
 
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index a4ff29a7d..13446f5d9 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -402,7 +402,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 			return 0, nil, tcpip.ErrBroadcastDisabled
 		}
 
-		netProto, err := e.checkV4Mapped(to, false)
+		netProto, err := e.checkV4Mapped(to)
 		if err != nil {
 			return 0, nil, err
 		}
@@ -501,7 +501,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 		defer e.mu.Unlock()
 
 		fa := tcpip.FullAddress{Addr: v.InterfaceAddr}
-		netProto, err := e.checkV4Mapped(&fa, false)
+		netProto, err := e.checkV4Mapped(&fa)
 		if err != nil {
 			return err
 		}
@@ -839,35 +839,12 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 	return nil
 }
 
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	netProto := e.NetProto
-	if len(addr.Addr) == 0 {
-		return netProto, nil
-	}
-	if header.IsV4MappedAddress(addr.Addr) {
-		// Fail if using a v4 mapped address on a v6only endpoint.
-		if e.v6only {
-			return 0, tcpip.ErrNoRoute
-		}
-
-		netProto = header.IPv4ProtocolNumber
-		addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
-		if addr.Addr == header.IPv4Any {
-			addr.Addr = ""
-		}
-
-		// Fail if we are bound to an IPv6 address.
-		if !allowMismatch && len(e.ID.LocalAddress) == 16 {
-			return 0, tcpip.ErrNetworkUnreachable
-		}
-	}
-
-	// Fail if we're bound to an address length different from the one we're
-	// checking.
-	if l := len(e.ID.LocalAddress); l != 0 && l != len(addr.Addr) {
-		return 0, tcpip.ErrInvalidEndpointState
+func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+	if err != nil {
+		return 0, err
 	}
-
+	*addr = unwrapped
 	return netProto, nil
 }
 
@@ -916,7 +893,7 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 
 // Connect connects the endpoint to its peer. Specifying a NIC is optional.
 func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
-	netProto, err := e.checkV4Mapped(&addr, false)
+	netProto, err := e.checkV4Mapped(&addr)
 	if err != nil {
 		return err
 	}
@@ -1074,7 +1051,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 		return tcpip.ErrInvalidEndpointState
 	}
 
-	netProto, err := e.checkV4Mapped(&addr, true)
+	netProto, err := e.checkV4Mapped(&addr)
 	if err != nil {
 		return err
 	}
diff --git a/scripts/common.sh b/scripts/common.sh
index 6dabad141..fdb1aa142 100755
--- a/scripts/common.sh
+++ b/scripts/common.sh
@@ -73,7 +73,7 @@ function install_runsc() {
   sudo "${RUNSC_BIN}" install --experimental=true --runtime="${runtime}" -- --debug-log "${RUNSC_LOGS}" "$@"
 
   # Clear old logs files that may exist.
-  sudo rm -f "${RUNSC_LOGS_DIR}"/*
+  sudo rm -f "${RUNSC_LOGS_DIR}"/'*'
 
   # Restart docker to pick up the new runtime configuration.
   sudo systemctl restart docker
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index ce8abe217..4c7ec3f06 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -2693,6 +2693,7 @@ cc_binary(
     srcs = ["socket_inet_loopback.cc"],
     linkstatic = 1,
     deps = [
+        ":ip_socket_test_util",
         ":socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:posix_error",
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 138024d9e..5d114d460 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -32,6 +32,7 @@
 #include "absl/strings/str_cat.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
 #include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
@@ -102,6 +103,161 @@ TEST(BadSocketPairArgs, ValidateErrForBadCallsToSocketPair) {
               SyscallFailsWithErrno(EAFNOSUPPORT));
 }
 
+enum class Operation {
+  Bind,
+  Connect,
+  SendTo,
+};
+
+std::string OperationToString(Operation operation) {
+  switch (operation) {
+    case Operation::Bind:
+      return "Bind";
+    case Operation::Connect:
+      return "Connect";
+    case Operation::SendTo:
+      return "SendTo";
+  }
+}
+
+using OperationSequence = std::vector<Operation>;
+
+using DualStackSocketTest =
+    ::testing::TestWithParam<std::tuple<TestAddress, OperationSequence>>;
+
+TEST_P(DualStackSocketTest, AddressOperations) {
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_DGRAM, 0));
+
+  const TestAddress& addr = std::get<0>(GetParam());
+  const OperationSequence& operations = std::get<1>(GetParam());
+
+  auto addr_in = reinterpret_cast<const sockaddr*>(&addr.addr);
+
+  // sockets may only be bound once. Both `connect` and `sendto` cause a socket
+  // to be bound.
+  bool bound = false;
+  for (const Operation& operation : operations) {
+    bool sockname = false;
+    bool peername = false;
+    switch (operation) {
+      case Operation::Bind: {
+        ASSERT_NO_ERRNO(SetAddrPort(
+            addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 0));
+
+        int bind_ret = bind(fd.get(), addr_in, addr.addr_len);
+
+        // Dual stack sockets may only be bound to AF_INET6.
+        if (!bound && addr.family() == AF_INET6) {
+          EXPECT_THAT(bind_ret, SyscallSucceeds());
+          bound = true;
+
+          sockname = true;
+        } else {
+          EXPECT_THAT(bind_ret, SyscallFailsWithErrno(EINVAL));
+        }
+        break;
+      }
+      case Operation::Connect: {
+        ASSERT_NO_ERRNO(SetAddrPort(
+            addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 1337));
+
+        EXPECT_THAT(connect(fd.get(), addr_in, addr.addr_len),
+                    SyscallSucceeds())
+            << GetAddrStr(addr_in);
+        bound = true;
+
+        sockname = true;
+        peername = true;
+
+        break;
+      }
+      case Operation::SendTo: {
+        const char payload[] = "hello";
+        ASSERT_NO_ERRNO(SetAddrPort(
+            addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 1337));
+
+        ssize_t sendto_ret = sendto(fd.get(), &payload, sizeof(payload), 0,
+                                    addr_in, addr.addr_len);
+
+        EXPECT_THAT(sendto_ret, SyscallSucceedsWithValue(sizeof(payload)));
+        sockname = !bound;
+        bound = true;
+        break;
+      }
+    }
+
+    if (sockname) {
+      sockaddr_storage sock_addr;
+      socklen_t addrlen = sizeof(sock_addr);
+      ASSERT_THAT(getsockname(fd.get(), reinterpret_cast<sockaddr*>(&sock_addr),
+                              &addrlen),
+                  SyscallSucceeds());
+      ASSERT_EQ(addrlen, sizeof(struct sockaddr_in6));
+
+      auto sock_addr_in6 = reinterpret_cast<const sockaddr_in6*>(&sock_addr);
+
+      if (operation == Operation::SendTo) {
+        EXPECT_EQ(sock_addr_in6->sin6_family, AF_INET6);
+        EXPECT_TRUE(IN6_IS_ADDR_UNSPECIFIED(sock_addr_in6->sin6_addr.s6_addr32))
+            << OperationToString(operation) << " getsocknam="
+            << GetAddrStr(reinterpret_cast<sockaddr*>(&sock_addr));
+
+        EXPECT_NE(sock_addr_in6->sin6_port, 0);
+      } else if (IN6_IS_ADDR_V4MAPPED(
+                     reinterpret_cast<const sockaddr_in6*>(addr_in)
+                         ->sin6_addr.s6_addr32)) {
+        EXPECT_TRUE(IN6_IS_ADDR_V4MAPPED(sock_addr_in6->sin6_addr.s6_addr32))
+            << OperationToString(operation) << " getsocknam="
+            << GetAddrStr(reinterpret_cast<sockaddr*>(&sock_addr));
+      }
+    }
+
+    if (peername) {
+      sockaddr_storage peer_addr;
+      socklen_t addrlen = sizeof(peer_addr);
+      ASSERT_THAT(getpeername(fd.get(), reinterpret_cast<sockaddr*>(&peer_addr),
+                              &addrlen),
+                  SyscallSucceeds());
+      ASSERT_EQ(addrlen, sizeof(struct sockaddr_in6));
+
+      if (addr.family() == AF_INET ||
+          IN6_IS_ADDR_V4MAPPED(reinterpret_cast<const sockaddr_in6*>(addr_in)
+                                   ->sin6_addr.s6_addr32)) {
+        EXPECT_TRUE(IN6_IS_ADDR_V4MAPPED(
+            reinterpret_cast<const sockaddr_in6*>(&peer_addr)
+                ->sin6_addr.s6_addr32))
+            << OperationToString(operation) << " getpeername="
+            << GetAddrStr(reinterpret_cast<sockaddr*>(&peer_addr));
+      }
+    }
+  }
+}
+
+// TODO(gvisor.dev/issues/1556): uncomment V4MappedAny.
+INSTANTIATE_TEST_SUITE_P(
+    All, DualStackSocketTest,
+    ::testing::Combine(
+        ::testing::Values(V4Any(), V4Loopback(), /*V4MappedAny(),*/
+                          V4MappedLoopback(), V6Any(), V6Loopback()),
+        ::testing::ValuesIn<OperationSequence>(
+            {{Operation::Bind, Operation::Connect, Operation::SendTo},
+             {Operation::Bind, Operation::SendTo, Operation::Connect},
+             {Operation::Connect, Operation::Bind, Operation::SendTo},
+             {Operation::Connect, Operation::SendTo, Operation::Bind},
+             {Operation::SendTo, Operation::Bind, Operation::Connect},
+             {Operation::SendTo, Operation::Connect, Operation::Bind}})),
+    [](::testing::TestParamInfo<
+        std::tuple<TestAddress, OperationSequence>> const& info) {
+      const TestAddress& addr = std::get<0>(info.param);
+      const OperationSequence& operations = std::get<1>(info.param);
+      std::string s = addr.description;
+      for (const Operation& operation : operations) {
+        absl::StrAppend(&s, OperationToString(operation));
+      }
+      return s;
+    });
+
 void tcpSimpleConnectTest(TestAddress const& listener,
                           TestAddress const& connector, bool unbound) {
   // Create the listening socket.
-- 
cgit v1.2.3


From 50625cee59aaff834c7968771ab385ad0e7b0e1f Mon Sep 17 00:00:00 2001
From: Tamir Duberstein <tamird@google.com>
Date: Tue, 14 Jan 2020 13:31:52 -0800
Subject: Implement {g,s}etsockopt(IP_RECVTOS) for UDP sockets

PiperOrigin-RevId: 289718534
---
 pkg/sentry/socket/control/control.go         |  2 +-
 pkg/sentry/socket/netstack/netstack.go       | 36 +++++++++++++--
 pkg/tcpip/checker/checker.go                 | 16 +++++++
 pkg/tcpip/stack/nic.go                       |  2 +-
 pkg/tcpip/stack/stack.go                     |  2 +-
 pkg/tcpip/tcpip.go                           |  8 +++-
 pkg/tcpip/transport/udp/endpoint.go          | 40 +++++++++++++++--
 pkg/tcpip/transport/udp/udp_test.go          | 67 ++++++++++++++++++++++++----
 test/syscalls/linux/socket_ip_udp_generic.cc | 40 +++++++++++++++++
 test/syscalls/linux/udp_socket_test_cases.cc |  8 ++--
 10 files changed, 197 insertions(+), 24 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 4301b697c..1684dfc24 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -327,7 +327,7 @@ func PackInq(t *kernel.Task, inq int32, buf []byte) []byte {
 }
 
 // PackTOS packs an IP_TOS socket control message.
-func PackTOS(t *kernel.Task, tos int8, buf []byte) []byte {
+func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
 	return putCmsgStruct(
 		buf,
 		linux.SOL_IP,
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index c020c11cb..d2f7e987d 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1268,11 +1268,11 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
 		if err != nil {
 			return nil, syserr.TranslateNetstackError(err)
 		}
-		var o uint32
+		var o int32
 		if v {
 			o = 1
 		}
-		return int32(o), nil
+		return o, nil
 
 	case linux.IPV6_PATHMTU:
 		t.Kernel().EmitUnimplementedEvent(t)
@@ -1377,6 +1377,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
 		}
 		return int32(v), nil
 
+	case linux.IP_RECVTOS:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptBool(tcpip.ReceiveTOSOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+		var o int32
+		if v {
+			o = 1
+		}
+		return o, nil
+
 	default:
 		emitUnimplementedEventIP(t, name)
 	}
@@ -1895,6 +1910,13 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v)))
 
+	case linux.IP_RECVTOS:
+		v, err := parseIntOrChar(optVal)
+		if err != nil {
+			return err
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0))
+
 	case linux.IP_ADD_SOURCE_MEMBERSHIP,
 		linux.IP_BIND_ADDRESS_NO_PORT,
 		linux.IP_BLOCK_SOURCE,
@@ -1915,7 +1937,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
 		linux.IP_RECVFRAGSIZE,
 		linux.IP_RECVOPTS,
 		linux.IP_RECVORIGDSTADDR,
-		linux.IP_RECVTOS,
 		linux.IP_RECVTTL,
 		linux.IP_RETOPTS,
 		linux.IP_TRANSPARENT,
@@ -2335,7 +2356,14 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
 }
 
 func (s *SocketOperations) controlMessages() socket.ControlMessages {
-	return socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, Timestamp: s.readCM.Timestamp}}
+	return socket.ControlMessages{
+		IP: tcpip.ControlMessages{
+			HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp,
+			Timestamp:    s.readCM.Timestamp,
+			HasTOS:       s.readCM.HasTOS,
+			TOS:          s.readCM.TOS,
+		},
+	}
 }
 
 // updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 2f15bf1f1..542abc99d 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -33,6 +33,9 @@ type NetworkChecker func(*testing.T, []header.Network)
 // TransportChecker is a function to check a property of a transport packet.
 type TransportChecker func(*testing.T, header.Transport)
 
+// ControlMessagesChecker is a function to check a property of ancillary data.
+type ControlMessagesChecker func(*testing.T, tcpip.ControlMessages)
+
 // IPv4 checks the validity and properties of the given IPv4 packet. It is
 // expected to be used in conjunction with other network checkers for specific
 // properties. For example, to check the source and destination address, one
@@ -158,6 +161,19 @@ func FragmentFlags(flags uint8) NetworkChecker {
 	}
 }
 
+// ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
+func ReceiveTOS(want uint8) ControlMessagesChecker {
+	return func(t *testing.T, cm tcpip.ControlMessages) {
+		t.Helper()
+		if !cm.HasTOS {
+			t.Fatalf("got cm.HasTOS = %t, want cm.TOS = %d", cm.HasTOS, want)
+		}
+		if got := cm.TOS; got != want {
+			t.Fatalf("got cm.TOS = %d, want %d", got, want)
+		}
+	}
+}
+
 // TOS creates a checker that checks the TOS field.
 func TOS(tos uint8, label uint32) NetworkChecker {
 	return func(t *testing.T, h []header.Network) {
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index abf73fe33..071221d5a 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -763,7 +763,7 @@ func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) {
 	n.mu.Unlock()
 }
 
-// Subnets returns the Subnets associated with this NIC.
+// AddressRanges returns the Subnets associated with this NIC.
 func (n *NIC) AddressRanges() []tcpip.Subnet {
 	n.mu.RLock()
 	defer n.mu.RUnlock()
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index f8d89248e..386eb6eec 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -912,7 +912,7 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
 	return false
 }
 
-// NICSubnets returns a map of NICIDs to their associated subnets.
+// NICAddressRanges returns a map of NICIDs to their associated subnets.
 func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 4a090ac86..b7813cbc0 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -322,7 +322,7 @@ type ControlMessages struct {
 	HasTOS bool
 
 	// TOS is the IPv4 type of service of the associated packet.
-	TOS int8
+	TOS uint8
 
 	// HasTClass indicates whether Tclass is valid/set.
 	HasTClass bool
@@ -500,9 +500,13 @@ type WriteOptions struct {
 type SockOptBool int
 
 const (
+	// ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
+	// ancillary message is passed with incoming packets.
+	ReceiveTOSOption SockOptBool = iota
+
 	// V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
 	// socket is to be restricted to sending and receiving IPv6 packets only.
-	V6OnlyOption SockOptBool = iota
+	V6OnlyOption
 )
 
 // SockOptInt represents socket options which values have the int type.
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 13446f5d9..c9cbed8f4 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -31,6 +31,7 @@ type udpPacket struct {
 	senderAddress tcpip.FullAddress
 	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
 	timestamp     int64
+	tos           uint8
 }
 
 // EndpointState represents the state of a UDP endpoint.
@@ -113,6 +114,10 @@ type endpoint struct {
 	// applied while sending packets. Defaults to 0 as on Linux.
 	sendTOS uint8
 
+	// receiveTOS determines if the incoming IPv4 TOS header field is passed
+	// as ancillary data to ControlMessages on Read.
+	receiveTOS bool
+
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -243,7 +248,18 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 		*addr = p.senderAddress
 	}
 
-	return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil
+	cm := tcpip.ControlMessages{
+		HasTimestamp: true,
+		Timestamp:    p.timestamp,
+	}
+	e.mu.RLock()
+	receiveTOS := e.receiveTOS
+	e.mu.RUnlock()
+	if receiveTOS {
+		cm.HasTOS = true
+		cm.TOS = p.tos
+	}
+	return p.data.ToView(), cm, nil
 }
 
 // prepareForWrite prepares the endpoint for sending data. In particular, it
@@ -458,6 +474,12 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
 // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
 func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 	switch opt {
+	case tcpip.ReceiveTOSOption:
+		e.mu.Lock()
+		e.receiveTOS = v
+		e.mu.Unlock()
+		return nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
@@ -664,15 +686,21 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
 func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 	switch opt {
+	case tcpip.ReceiveTOSOption:
+		e.mu.RLock()
+		v := e.receiveTOS
+		e.mu.RUnlock()
+		return v, nil
+
 	case tcpip.V6OnlyOption:
 		// We only recognize this option on v6 endpoints.
 		if e.NetProto != header.IPv6ProtocolNumber {
 			return false, tcpip.ErrUnknownProtocolOption
 		}
 
-		e.mu.Lock()
+		e.mu.RLock()
 		v := e.v6only
-		e.mu.Unlock()
+		e.mu.RUnlock()
 
 		return v, nil
 	}
@@ -1215,6 +1243,12 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 	e.rcvList.PushBack(packet)
 	e.rcvBufSize += pkt.Data.Size()
 
+	// Save any useful information from the network header to the packet.
+	switch r.NetProto {
+	case header.IPv4ProtocolNumber:
+		packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
+	}
+
 	packet.timestamp = e.stack.NowNanoseconds()
 
 	e.rcvMu.Unlock()
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 0a82bc4fa..ee9d10555 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -56,6 +56,7 @@ const (
 	multicastAddr   = "\xe8\x2b\xd3\xea"
 	multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
 	broadcastAddr   = header.IPv4Broadcast
+	testTOS         = 0x80
 
 	// defaultMTU is the MTU, in bytes, used throughout the tests, except
 	// where another value is explicitly used. It is chosen to match the MTU
@@ -453,6 +454,7 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
 	ip := header.IPv4(buf)
 	ip.Encode(&header.IPv4Fields{
 		IHL:         header.IPv4MinimumSize,
+		TOS:         testTOS,
 		TotalLength: uint16(len(buf)),
 		TTL:         65,
 		Protocol:    uint8(udp.ProtocolNumber),
@@ -552,8 +554,8 @@ func TestBindToDeviceOption(t *testing.T) {
 // testReadInternal sends a packet of the given test flow into the stack by
 // injecting it into the link endpoint. It then attempts to read it from the
 // UDP endpoint and depending on if this was expected to succeed verifies its
-// correctness.
-func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool) {
+// correctness including any additional checker functions provided.
+func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool, checkers ...checker.ControlMessagesChecker) {
 	c.t.Helper()
 
 	payload := newPayload()
@@ -568,12 +570,12 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
 
 	var addr tcpip.FullAddress
-	v, _, err := c.ep.Read(&addr)
+	v, cm, err := c.ep.Read(&addr)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for data to become available.
 		select {
 		case <-ch:
-			v, _, err = c.ep.Read(&addr)
+			v, cm, err = c.ep.Read(&addr)
 
 		case <-time.After(300 * time.Millisecond):
 			if packetShouldBeDropped {
@@ -606,15 +608,21 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
 	if !bytes.Equal(payload, v) {
 		c.t.Fatalf("bad payload: got %x, want %x", v, payload)
 	}
+
+	// Run any checkers against the ControlMessages.
+	for _, f := range checkers {
+		f(c.t, cm)
+	}
+
 	c.checkEndpointReadStats(1, epstats, err)
 }
 
 // testRead sends a packet of the given test flow into the stack by injecting it
 // into the link endpoint. It then reads it from the UDP endpoint and verifies
-// its correctness.
-func testRead(c *testContext, flow testFlow) {
+// its correctness including any additional checker functions provided.
+func testRead(c *testContext, flow testFlow, checkers ...checker.ControlMessagesChecker) {
 	c.t.Helper()
-	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */)
+	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */, checkers...)
 }
 
 // testFailingRead sends a packet of the given test flow into the stack by
@@ -1282,7 +1290,7 @@ func TestTOSV4(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = 0xC0
+			const tos = testTOS
 			var v tcpip.IPv4TOSOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1317,7 +1325,7 @@ func TestTOSV6(t *testing.T) {
 
 			c.createEndpointForFlow(flow)
 
-			const tos = 0xC0
+			const tos = testTOS
 			var v tcpip.IPv6TrafficClassOption
 			if err := c.ep.GetSockOpt(&v); err != nil {
 				c.t.Errorf("GetSockopt failed: %s", err)
@@ -1344,6 +1352,47 @@ func TestTOSV6(t *testing.T) {
 	}
 }
 
+func TestReceiveTOSV4(t *testing.T) {
+	for _, flow := range []testFlow{unicastV4, broadcast} {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			c := newDualTestContext(t, defaultMTU)
+			defer c.cleanup()
+
+			c.createEndpointForFlow(flow)
+
+			// Verify that setting and reading the option works.
+			v, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
+			if err != nil {
+				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
+			}
+			// Test for expected default value.
+			if v != false {
+				c.t.Errorf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", v, false)
+			}
+
+			want := true
+			if err := c.ep.SetSockOptBool(tcpip.ReceiveTOSOption, want); err != nil {
+				c.t.Fatalf("SetSockOptBool(tcpip.ReceiveTOSOption, %t) failed: %s", want, err)
+			}
+
+			got, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
+			if err != nil {
+				c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
+			}
+			if got != want {
+				c.t.Fatalf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", got, want)
+			}
+
+			// Verify that the correct received TOS is handed through as
+			// ancillary data to the ControlMessages struct.
+			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+				c.t.Fatal("Bind failed:", err)
+			}
+			testRead(c, flow, checker.ReceiveTOS(testTOS))
+		})
+	}
+}
+
 func TestMulticastInterfaceOption(t *testing.T) {
 	for _, flow := range []testFlow{multicastV4, multicastV4in6, multicastV6, multicastV6Only} {
 		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 66eb68857..53290bed7 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -209,6 +209,46 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
   EXPECT_EQ(get, kSockOptOn);
 }
 
+// Ensure that Receiving TOS is off by default.
+TEST_P(UDPSocketPairTest, RecvTosDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that setting and getting IP_RECVTOS works as expected.
+TEST_P(UDPSocketPairTest, SetRecvTos) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
+                         &kSockOptOff, sizeof(kSockOptOff)),
+              SyscallSucceeds());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOff);
+
+  ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
+                         &kSockOptOn, sizeof(kSockOptOn)),
+              SyscallSucceeds());
+
+  ASSERT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, kSockOptOn);
+}
+
 TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index dc35c2f50..68e0a8109 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -1349,8 +1349,9 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
 // outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SetAndReceiveTOS) {
-  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
-  SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
+          !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
 
@@ -1421,7 +1422,8 @@ TEST_P(UdpSocketTest, SetAndReceiveTOS) {
 // TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
 // IPV6_RECVTCLASS will create the corresponding control message.
 TEST_P(UdpSocketTest, SendAndReceiveTOS) {
-  // TODO(b/68320120): IP_RECVTOS/IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+  // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
   SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
   ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-- 
cgit v1.2.3


From a611fdaee3c14abe2222140ae0a8a742ebfd31ab Mon Sep 17 00:00:00 2001
From: Bhasker Hariharan <bhaskerh@google.com>
Date: Tue, 14 Jan 2020 14:14:17 -0800
Subject: Changes TCP packet dispatch to use a pool of goroutines.

All inbound segments for connections in ESTABLISHED state are delivered to the
endpoint's queue but for every segment delivered we also queue the endpoint for
processing to a selected processor. This ensures that when there are a large
number of connections in ESTABLISHED state the inbound packets are all handled
by a small number of goroutines and significantly reduces the amount of work the
goscheduler has to perform.

We let connections in other states follow the current path where the
endpoint's goroutine directly handles the segments.

Updates #231

PiperOrigin-RevId: 289728325
---
 benchmarks/tcp/tcp_proxy.go                 |   6 +-
 pkg/sleep/sleep_test.go                     |  31 +++
 pkg/tcpip/stack/transport_demuxer.go        |  54 ++++-
 pkg/tcpip/transport/tcp/BUILD               |  15 +-
 pkg/tcpip/transport/tcp/accept.go           |   9 +-
 pkg/tcpip/transport/tcp/connect.go          | 310 ++++++++++++++++------------
 pkg/tcpip/transport/tcp/dispatcher.go       | 218 +++++++++++++++++++
 pkg/tcpip/transport/tcp/endpoint.go         | 303 ++++++++++++++++++---------
 pkg/tcpip/transport/tcp/endpoint_state.go   |  30 +--
 pkg/tcpip/transport/tcp/protocol.go         |  11 +
 pkg/tcpip/transport/tcp/rcv.go              |  21 +-
 pkg/tcpip/transport/tcp/snd.go              |  14 +-
 pkg/tcpip/transport/tcp/tcp_test.go         |  11 +-
 test/syscalls/linux/socket_inet_loopback.cc |   2 +-
 test/syscalls/linux/tcp_socket.cc           |  14 ++
 15 files changed, 769 insertions(+), 280 deletions(-)
 create mode 100644 pkg/tcpip/transport/tcp/dispatcher.go

(limited to 'test/syscalls/linux')

diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go
index be0d7bdd6..dc96add66 100644
--- a/benchmarks/tcp/tcp_proxy.go
+++ b/benchmarks/tcp/tcp_proxy.go
@@ -85,7 +85,7 @@ func (netImpl) printStats() {
 
 const (
 	nicID      = 1       // Fixed.
-	rcvBufSize = 1 << 20 // 1MB.
+	rcvBufSize = 4 << 20 // 1MB.
 )
 
 type netstackImpl struct {
@@ -130,6 +130,10 @@ func setupNetwork(ifaceName string, numChannels int) (fds []int, err error) {
 				return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", rcvBufSize, err)
 			}
 
+			if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, rcvBufSize); err != nil {
+				return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", rcvBufSize, err)
+			}
+
 			if !*swgso && *gso != 0 {
 				if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
 					return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
diff --git a/pkg/sleep/sleep_test.go b/pkg/sleep/sleep_test.go
index 130806c86..af47e2ba1 100644
--- a/pkg/sleep/sleep_test.go
+++ b/pkg/sleep/sleep_test.go
@@ -376,6 +376,37 @@ func TestRace(t *testing.T) {
 	}
 }
 
+// TestRaceInOrder tests that multiple wakers can continuously send wake requests to
+// the sleeper and that the wakers are retrieved in the order asserted.
+func TestRaceInOrder(t *testing.T) {
+	const wakers = 100
+	const wakeRequests = 10000
+
+	w := make([]Waker, wakers)
+	s := Sleeper{}
+
+	// Associate each waker and start goroutines that will assert them.
+	for i := range w {
+		s.AddWaker(&w[i], i)
+	}
+	go func() {
+		n := 0
+		for n < wakeRequests {
+			wk := w[n%len(w)]
+			wk.Assert()
+			n++
+		}
+	}()
+
+	// Wait for all wake up notifications from all wakers.
+	for i := 0; i < wakeRequests; i++ {
+		v, _ := s.Fetch(true)
+		if got, want := v, i%wakers; got != want {
+			t.Fatalf("got  %d want %d", got, want)
+		}
+	}
+}
+
 // BenchmarkSleeperMultiSelect measures how long it takes to fetch a wake up
 // from 4 wakers when at least one is already asserted.
 func BenchmarkSleeperMultiSelect(b *testing.B) {
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index f384a91de..d686e6eb8 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -104,7 +104,14 @@ func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, p
 		return
 	}
 	// multiPortEndpoints are guaranteed to have at least one element.
-	selectEndpoint(id, mpep, epsByNic.seed).HandlePacket(r, id, pkt)
+	transEP := selectEndpoint(id, mpep, epsByNic.seed)
+	if queuedProtocol, mustQueue := mpep.demux.queuedProtocols[protocolIDs{mpep.netProto, mpep.transProto}]; mustQueue {
+		queuedProtocol.QueuePacket(r, transEP, id, pkt)
+		epsByNic.mu.RUnlock()
+		return
+	}
+
+	transEP.HandlePacket(r, id, pkt)
 	epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
 }
 
@@ -130,7 +137,7 @@ func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpoint
 
 // registerEndpoint returns true if it succeeds. It fails and returns
 // false if ep already has an element with the same key.
-func (epsByNic *endpointsByNic) registerEndpoint(t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+func (epsByNic *endpointsByNic) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
 	epsByNic.mu.Lock()
 	defer epsByNic.mu.Unlock()
 
@@ -140,7 +147,7 @@ func (epsByNic *endpointsByNic) registerEndpoint(t TransportEndpoint, reusePort
 	}
 
 	// This is a new binding.
-	multiPortEp := &multiPortEndpoint{}
+	multiPortEp := &multiPortEndpoint{demux: d, netProto: netProto, transProto: transProto}
 	multiPortEp.endpointsMap = make(map[TransportEndpoint]int)
 	multiPortEp.reuse = reusePort
 	epsByNic.endpoints[bindToDevice] = multiPortEp
@@ -168,18 +175,34 @@ func (epsByNic *endpointsByNic) unregisterEndpoint(bindToDevice tcpip.NICID, t T
 // newTransportDemuxer.
 type transportDemuxer struct {
 	// protocol is immutable.
-	protocol map[protocolIDs]*transportEndpoints
+	protocol        map[protocolIDs]*transportEndpoints
+	queuedProtocols map[protocolIDs]queuedTransportProtocol
+}
+
+// queuedTransportProtocol if supported by a protocol implementation will cause
+// the dispatcher to delivery packets to the QueuePacket method instead of
+// calling HandlePacket directly on the endpoint.
+type queuedTransportProtocol interface {
+	QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt tcpip.PacketBuffer)
 }
 
 func newTransportDemuxer(stack *Stack) *transportDemuxer {
-	d := &transportDemuxer{protocol: make(map[protocolIDs]*transportEndpoints)}
+	d := &transportDemuxer{
+		protocol:        make(map[protocolIDs]*transportEndpoints),
+		queuedProtocols: make(map[protocolIDs]queuedTransportProtocol),
+	}
 
 	// Add each network and transport pair to the demuxer.
 	for netProto := range stack.networkProtocols {
 		for proto := range stack.transportProtocols {
-			d.protocol[protocolIDs{netProto, proto}] = &transportEndpoints{
+			protoIDs := protocolIDs{netProto, proto}
+			d.protocol[protoIDs] = &transportEndpoints{
 				endpoints: make(map[TransportEndpointID]*endpointsByNic),
 			}
+			qTransProto, isQueued := (stack.transportProtocols[proto].proto).(queuedTransportProtocol)
+			if isQueued {
+				d.queuedProtocols[protoIDs] = qTransProto
+			}
 		}
 	}
 
@@ -209,7 +232,11 @@ func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNum
 //
 // +stateify savable
 type multiPortEndpoint struct {
-	mu           sync.RWMutex `state:"nosave"`
+	mu         sync.RWMutex `state:"nosave"`
+	demux      *transportDemuxer
+	netProto   tcpip.NetworkProtocolNumber
+	transProto tcpip.TransportProtocolNumber
+
 	endpointsArr []TransportEndpoint
 	endpointsMap map[TransportEndpoint]int
 	// reuse indicates if more than one endpoint is allowed.
@@ -258,13 +285,22 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32
 
 func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
 	ep.mu.RLock()
+	queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}]
 	for i, endpoint := range ep.endpointsArr {
 		// HandlePacket takes ownership of pkt, so each endpoint needs
 		// its own copy except for the final one.
 		if i == len(ep.endpointsArr)-1 {
+			if mustQueue {
+				queuedProtocol.QueuePacket(r, endpoint, id, pkt)
+				break
+			}
 			endpoint.HandlePacket(r, id, pkt)
 			break
 		}
+		if mustQueue {
+			queuedProtocol.QueuePacket(r, endpoint, id, pkt.Clone())
+			continue
+		}
 		endpoint.HandlePacket(r, id, pkt.Clone())
 	}
 	ep.mu.RUnlock() // Don't use defer for performance reasons.
@@ -357,7 +393,7 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
 
 	if epsByNic, ok := eps.endpoints[id]; ok {
 		// There was already a binding.
-		return epsByNic.registerEndpoint(ep, reusePort, bindToDevice)
+		return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
 	}
 
 	// This is a new binding.
@@ -367,7 +403,7 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
 	}
 	eps.endpoints[id] = epsByNic
 
-	return epsByNic.registerEndpoint(ep, reusePort, bindToDevice)
+	return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
 }
 
 // unregisterEndpoint unregisters the endpoint with the given id such that it
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 353bd06f4..0e3ab05ad 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -16,6 +16,18 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "tcp_endpoint_list",
+    out = "tcp_endpoint_list.go",
+    package = "tcp",
+    prefix = "endpoint",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*endpoint",
+        "Linker": "*endpoint",
+    },
+)
+
 go_library(
     name = "tcp",
     srcs = [
@@ -23,6 +35,7 @@ go_library(
         "connect.go",
         "cubic.go",
         "cubic_state.go",
+        "dispatcher.go",
         "endpoint.go",
         "endpoint_state.go",
         "forwarder.go",
@@ -38,6 +51,7 @@ go_library(
         "segment_state.go",
         "snd.go",
         "snd_state.go",
+        "tcp_endpoint_list.go",
         "tcp_segment_list.go",
         "timer.go",
     ],
@@ -45,7 +59,6 @@ go_library(
     imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
     visibility = ["//visibility:public"],
     deps = [
-        "//pkg/log",
         "//pkg/rand",
         "//pkg/sleep",
         "//pkg/sync",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 1ea996936..1a2e3efa9 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -285,7 +285,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
 	// listenEP is nil when listenContext is used by tcp.Forwarder.
 	if l.listenEP != nil {
 		l.listenEP.mu.Lock()
-		if l.listenEP.state != StateListen {
+		if l.listenEP.EndpointState() != StateListen {
 			l.listenEP.mu.Unlock()
 			return nil, tcpip.ErrConnectionAborted
 		}
@@ -344,11 +344,12 @@ func (l *listenContext) closeAllPendingEndpoints() {
 // instead.
 func (e *endpoint) deliverAccepted(n *endpoint) {
 	e.mu.Lock()
-	state := e.state
+	state := e.EndpointState()
 	e.pendingAccepted.Add(1)
 	defer e.pendingAccepted.Done()
 	acceptedChan := e.acceptedChan
 	e.mu.Unlock()
+
 	if state == StateListen {
 		acceptedChan <- n
 		e.waiterQueue.Notify(waiter.EventIn)
@@ -562,8 +563,8 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
 		// We do not use transitionToStateEstablishedLocked here as there is
 		// no handshake state available when doing a SYN cookie based accept.
 		n.stack.Stats().TCP.CurrentEstablished.Increment()
-		n.state = StateEstablished
 		n.isConnectNotified = true
+		n.setEndpointState(StateEstablished)
 
 		// Do the delivery in a separate goroutine so
 		// that we don't block the listen loop in case
@@ -596,7 +597,7 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
 		// handleSynSegment() from attempting to queue new connections
 		// to the endpoint.
 		e.mu.Lock()
-		e.state = StateClose
+		e.setEndpointState(StateClose)
 
 		// close any endpoints in SYN-RCVD state.
 		ctx.closeAllPendingEndpoints()
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 613ec1775..f3896715b 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -190,7 +190,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
 	h.mss = opts.MSS
 	h.sndWndScale = opts.WS
 	h.ep.mu.Lock()
-	h.ep.state = StateSynRecv
+	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
 }
 
@@ -274,14 +274,14 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	// SYN-RCVD state.
 	h.state = handshakeSynRcvd
 	h.ep.mu.Lock()
-	h.ep.state = StateSynRecv
 	ttl := h.ep.ttl
+	h.ep.setEndpointState(StateSynRecv)
 	h.ep.mu.Unlock()
 	synOpts := header.TCPSynOptions{
 		WS:    int(h.effectiveRcvWndScale()),
 		TS:    rcvSynOpts.TS,
 		TSVal: h.ep.timestamp(),
-		TSEcr: h.ep.recentTS,
+		TSEcr: h.ep.recentTimestamp(),
 
 		// We only send SACKPermitted if the other side indicated it
 		// permits SACK. This is not explicitly defined in the RFC but
@@ -341,7 +341,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			WS:            h.rcvWndScale,
 			TS:            h.ep.sendTSOk,
 			TSVal:         h.ep.timestamp(),
-			TSEcr:         h.ep.recentTS,
+			TSEcr:         h.ep.recentTimestamp(),
 			SACKPermitted: h.ep.sackPermitted,
 			MSS:           h.ep.amss,
 		}
@@ -501,7 +501,7 @@ func (h *handshake) execute() *tcpip.Error {
 		WS:            h.rcvWndScale,
 		TS:            true,
 		TSVal:         h.ep.timestamp(),
-		TSEcr:         h.ep.recentTS,
+		TSEcr:         h.ep.recentTimestamp(),
 		SACKPermitted: bool(sackEnabled),
 		MSS:           h.ep.amss,
 	}
@@ -792,7 +792,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
 		// Ref: https://tools.ietf.org/html/rfc7323#section-5.4.
 		offset += header.EncodeNOP(options[offset:])
 		offset += header.EncodeNOP(options[offset:])
-		offset += header.EncodeTSOption(e.timestamp(), uint32(e.recentTS), options[offset:])
+		offset += header.EncodeTSOption(e.timestamp(), e.recentTimestamp(), options[offset:])
 	}
 	if e.sackPermitted && len(sackBlocks) > 0 {
 		offset += header.EncodeNOP(options[offset:])
@@ -811,7 +811,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
 // sendRaw sends a TCP segment to the endpoint's peer.
 func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error {
 	var sackBlocks []header.SACKBlock
-	if e.state == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
+	if e.EndpointState() == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
 		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
 	}
 	options := e.makeOptions(sackBlocks)
@@ -848,6 +848,9 @@ func (e *endpoint) handleWrite() *tcpip.Error {
 }
 
 func (e *endpoint) handleClose() *tcpip.Error {
+	if !e.EndpointState().connected() {
+		return nil
+	}
 	// Drain the send queue.
 	e.handleWrite()
 
@@ -864,11 +867,7 @@ func (e *endpoint) handleClose() *tcpip.Error {
 func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 	// Only send a reset if the connection is being aborted for a reason
 	// other than receiving a reset.
-	if e.state == StateEstablished || e.state == StateCloseWait {
-		e.stack.Stats().TCP.EstablishedResets.Increment()
-		e.stack.Stats().TCP.CurrentEstablished.Decrement()
-	}
-	e.state = StateError
+	e.setEndpointState(StateError)
 	e.HardError = err
 	if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout {
 		// The exact sequence number to be used for the RST is the same as the
@@ -888,9 +887,12 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 }
 
 // completeWorkerLocked is called by the worker goroutine when it's about to
-// exit. It marks the worker as completed and performs cleanup work if requested
-// by Close().
+// exit.
 func (e *endpoint) completeWorkerLocked() {
+	// Worker is terminating(either due to moving to
+	// CLOSED or ERROR state, ensure we release all
+	// registrations port reservations even if the socket
+	// itself is not yet closed by the application.
 	e.workerRunning = false
 	if e.workerCleanup {
 		e.cleanupLocked()
@@ -917,8 +919,7 @@ func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
 		e.rcvAutoParams.prevCopied = int(h.rcvWnd)
 		e.rcvListMu.Unlock()
 	}
-	h.ep.stack.Stats().TCP.CurrentEstablished.Increment()
-	e.state = StateEstablished
+	e.setEndpointState(StateEstablished)
 }
 
 // transitionToStateCloseLocked ensures that the endpoint is
@@ -927,11 +928,12 @@ func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
 // delivered to this endpoint from the demuxer when the endpoint
 // is transitioned to StateClose.
 func (e *endpoint) transitionToStateCloseLocked() {
-	if e.state == StateClose {
+	if e.EndpointState() == StateClose {
 		return
 	}
+	// Mark the endpoint as fully closed for reads/writes.
 	e.cleanupLocked()
-	e.state = StateClose
+	e.setEndpointState(StateClose)
 	e.stack.Stats().TCP.EstablishedClosed.Increment()
 }
 
@@ -946,7 +948,9 @@ func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 		s.decRef()
 		return
 	}
-	ep.(*endpoint).enqueueSegment(s)
+	if ep.(*endpoint).enqueueSegment(s) {
+		ep.(*endpoint).newSegmentWaker.Assert()
+	}
 }
 
 func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
@@ -955,9 +959,8 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 		// except SYN-SENT, all reset (RST) segments are
 		// validated by checking their SEQ-fields." So
 		// we only process it if it's acceptable.
-		s.decRef()
 		e.mu.Lock()
-		switch e.state {
+		switch e.EndpointState() {
 		// In case of a RST in CLOSE-WAIT linux moves
 		// the socket to closed state with an error set
 		// to indicate EPIPE.
@@ -981,103 +984,57 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 			e.transitionToStateCloseLocked()
 			e.HardError = tcpip.ErrAborted
 			e.mu.Unlock()
+			e.notifyProtocolGoroutine(notifyTickleWorker)
 			return false, nil
 		default:
 			e.mu.Unlock()
+			// RFC 793, page 37 states that "in all states
+			// except SYN-SENT, all reset (RST) segments are
+			// validated by checking their SEQ-fields." So
+			// we only process it if it's acceptable.
+
+			// Notify protocol goroutine. This is required when
+			// handleSegment is invoked from the processor goroutine
+			// rather than the worker goroutine.
+			e.notifyProtocolGoroutine(notifyResetByPeer)
 			return false, tcpip.ErrConnectionReset
 		}
 	}
 	return true, nil
 }
 
-// handleSegments pulls segments from the queue and processes them. It returns
-// no error if the protocol loop should continue, an error otherwise.
-func (e *endpoint) handleSegments() *tcpip.Error {
+// handleSegments processes all inbound segments.
+func (e *endpoint) handleSegments(fastPath bool) *tcpip.Error {
 	checkRequeue := true
 	for i := 0; i < maxSegmentsPerWake; i++ {
+		if e.EndpointState() == StateClose || e.EndpointState() == StateError {
+			return nil
+		}
 		s := e.segmentQueue.dequeue()
 		if s == nil {
 			checkRequeue = false
 			break
 		}
 
-		// Invoke the tcp probe if installed.
-		if e.probe != nil {
-			e.probe(e.completeState())
+		cont, err := e.handleSegment(s)
+		if err != nil {
+			s.decRef()
+			e.mu.Lock()
+			e.setEndpointState(StateError)
+			e.HardError = err
+			e.mu.Unlock()
+			return err
 		}
-
-		if s.flagIsSet(header.TCPFlagRst) {
-			if ok, err := e.handleReset(s); !ok {
-				return err
-			}
-		} else if s.flagIsSet(header.TCPFlagSyn) {
-			// See: https://tools.ietf.org/html/rfc5961#section-4.1
-			//   1) If the SYN bit is set, irrespective of the sequence number, TCP
-			//    MUST send an ACK (also referred to as challenge ACK) to the remote
-			//    peer:
-			//
-			//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
-			//
-			//    After sending the acknowledgment, TCP MUST drop the unacceptable
-			//    segment and stop processing further.
-			//
-			// By sending an ACK, the remote peer is challenged to confirm the loss
-			// of the previous connection and the request to start a new connection.
-			// A legitimate peer, after restart, would not have a TCB in the
-			// synchronized state.  Thus, when the ACK arrives, the peer should send
-			// a RST segment back with the sequence number derived from the ACK
-			// field that caused the RST.
-
-			// This RST will confirm that the remote peer has indeed closed the
-			// previous connection.  Upon receipt of a valid RST, the local TCP
-			// endpoint MUST terminate its connection.  The local TCP endpoint
-			// should then rely on SYN retransmission from the remote end to
-			// re-establish the connection.
-
-			e.snd.sendAck()
-		} else if s.flagIsSet(header.TCPFlagAck) {
-			// Patch the window size in the segment according to the
-			// send window scale.
-			s.window <<= e.snd.sndWndScale
-
-			// RFC 793, page 41 states that "once in the ESTABLISHED
-			// state all segments must carry current acknowledgment
-			// information."
-			drop, err := e.rcv.handleRcvdSegment(s)
-			if err != nil {
-				s.decRef()
-				return err
-			}
-			if drop {
-				s.decRef()
-				continue
-			}
-
-			// Now check if the received segment has caused us to transition
-			// to a CLOSED state, if yes then terminate processing and do
-			// not invoke the sender.
-			e.mu.RLock()
-			state := e.state
-			e.mu.RUnlock()
-			if state == StateClose {
-				// When we get into StateClose while processing from the queue,
-				// return immediately and let the protocolMainloop handle it.
-				//
-				// We can reach StateClose only while processing a previous segment
-				// or a notification from the protocolMainLoop (caller goroutine).
-				// This means that with this return, the segment dequeue below can
-				// never occur on a closed endpoint.
-				s.decRef()
-				return nil
-			}
-			e.snd.handleRcvdSegment(s)
+		if !cont {
+			s.decRef()
+			return nil
 		}
-		s.decRef()
 	}
 
-	// If the queue is not empty, make sure we'll wake up in the next
-	// iteration.
-	if checkRequeue && !e.segmentQueue.empty() {
+	// When fastPath is true we don't want to wake up the worker
+	// goroutine. If the endpoint has more segments to process the
+	// dispatcher will call handleSegments again anyway.
+	if !fastPath && checkRequeue && !e.segmentQueue.empty() {
 		e.newSegmentWaker.Assert()
 	}
 
@@ -1086,11 +1043,88 @@ func (e *endpoint) handleSegments() *tcpip.Error {
 		e.snd.sendAck()
 	}
 
-	e.resetKeepaliveTimer(true)
+	e.resetKeepaliveTimer(true /* receivedData */)
 
 	return nil
 }
 
+// handleSegment handles a given segment and notifies the worker goroutine if
+// if the connection should be terminated.
+func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
+	// Invoke the tcp probe if installed.
+	if e.probe != nil {
+		e.probe(e.completeState())
+	}
+
+	if s.flagIsSet(header.TCPFlagRst) {
+		if ok, err := e.handleReset(s); !ok {
+			return false, err
+		}
+	} else if s.flagIsSet(header.TCPFlagSyn) {
+		// See: https://tools.ietf.org/html/rfc5961#section-4.1
+		//   1) If the SYN bit is set, irrespective of the sequence number, TCP
+		//    MUST send an ACK (also referred to as challenge ACK) to the remote
+		//    peer:
+		//
+		//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+		//
+		//    After sending the acknowledgment, TCP MUST drop the unacceptable
+		//    segment and stop processing further.
+		//
+		// By sending an ACK, the remote peer is challenged to confirm the loss
+		// of the previous connection and the request to start a new connection.
+		// A legitimate peer, after restart, would not have a TCB in the
+		// synchronized state.  Thus, when the ACK arrives, the peer should send
+		// a RST segment back with the sequence number derived from the ACK
+		// field that caused the RST.
+
+		// This RST will confirm that the remote peer has indeed closed the
+		// previous connection.  Upon receipt of a valid RST, the local TCP
+		// endpoint MUST terminate its connection.  The local TCP endpoint
+		// should then rely on SYN retransmission from the remote end to
+		// re-establish the connection.
+
+		e.snd.sendAck()
+	} else if s.flagIsSet(header.TCPFlagAck) {
+		// Patch the window size in the segment according to the
+		// send window scale.
+		s.window <<= e.snd.sndWndScale
+
+		// RFC 793, page 41 states that "once in the ESTABLISHED
+		// state all segments must carry current acknowledgment
+		// information."
+		drop, err := e.rcv.handleRcvdSegment(s)
+		if err != nil {
+			return false, err
+		}
+		if drop {
+			return true, nil
+		}
+
+		// Now check if the received segment has caused us to transition
+		// to a CLOSED state, if yes then terminate processing and do
+		// not invoke the sender.
+		e.mu.RLock()
+		state := e.state
+		e.mu.RUnlock()
+		if state == StateClose {
+			// When we get into StateClose while processing from the queue,
+			// return immediately and let the protocolMainloop handle it.
+			//
+			// We can reach StateClose only while processing a previous segment
+			// or a notification from the protocolMainLoop (caller goroutine).
+			// This means that with this return, the segment dequeue below can
+			// never occur on a closed endpoint.
+			s.decRef()
+			return false, nil
+		}
+
+		e.snd.handleRcvdSegment(s)
+	}
+
+	return true, nil
+}
+
 // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP
 // keepalive packets periodically when the connection is idle. If we don't hear
 // from the other side after a number of tries, we terminate the connection.
@@ -1160,7 +1194,7 @@ func (e *endpoint) disableKeepaliveTimer() {
 // protocolMainLoop is the main loop of the TCP protocol. It runs in its own
 // goroutine and is responsible for sending segments and handling received
 // segments.
-func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
+func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) *tcpip.Error {
 	var closeTimer *time.Timer
 	var closeWaker sleep.Waker
 
@@ -1182,6 +1216,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		}
 
 		e.mu.Unlock()
+		e.workMu.Unlock()
 		// When the protocol loop exits we should wake up our waiters.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
 	}
@@ -1193,7 +1228,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		initialRcvWnd := e.initialReceiveWindow()
 		h := newHandshake(e, seqnum.Size(initialRcvWnd))
 		e.mu.Lock()
-		h.ep.state = StateSynSent
+		h.ep.setEndpointState(StateSynSent)
 		e.mu.Unlock()
 
 		if err := h.execute(); err != nil {
@@ -1202,12 +1237,11 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			e.lastErrorMu.Unlock()
 
 			e.mu.Lock()
-			e.state = StateError
+			e.setEndpointState(StateError)
 			e.HardError = err
 
 			// Lock released below.
 			epilogue()
-
 			return err
 		}
 	}
@@ -1215,7 +1249,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	e.keepalive.timer.init(&e.keepalive.waker)
 	defer e.keepalive.timer.cleanup()
 
-	// Tell waiters that the endpoint is connected and writable.
 	e.mu.Lock()
 	drained := e.drainDone != nil
 	e.mu.Unlock()
@@ -1224,8 +1257,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		<-e.undrain
 	}
 
-	e.waiterQueue.Notify(waiter.EventOut)
-
 	// Set up the functions that will be called when the main protocol loop
 	// wakes up.
 	funcs := []struct {
@@ -1240,18 +1271,15 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 			w: &e.sndCloseWaker,
 			f: e.handleClose,
 		},
-		{
-			w: &e.newSegmentWaker,
-			f: e.handleSegments,
-		},
 		{
 			w: &closeWaker,
 			f: func() *tcpip.Error {
 				// This means the socket is being closed due
-				// to the TCP_FIN_WAIT2 timeout was hit. Just
+				// to the TCP-FIN-WAIT2 timeout was hit. Just
 				// mark the socket as closed.
 				e.mu.Lock()
 				e.transitionToStateCloseLocked()
+				e.workerCleanup = true
 				e.mu.Unlock()
 				return nil
 			},
@@ -1266,6 +1294,12 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 				return nil
 			},
 		},
+		{
+			w: &e.newSegmentWaker,
+			f: func() *tcpip.Error {
+				return e.handleSegments(false /* fastPath */)
+			},
+		},
 		{
 			w: &e.keepalive.waker,
 			f: e.keepaliveTimerExpired,
@@ -1293,14 +1327,16 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 				}
 
 				if n&notifyReset != 0 {
-					e.mu.Lock()
-					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
-					e.mu.Unlock()
+					return tcpip.ErrConnectionAborted
+				}
+
+				if n&notifyResetByPeer != 0 {
+					return tcpip.ErrConnectionReset
 				}
 
 				if n&notifyClose != 0 && closeTimer == nil {
 					e.mu.Lock()
-					if e.state == StateFinWait2 && e.closed {
+					if e.EndpointState() == StateFinWait2 && e.closed {
 						// The socket has been closed and we are in FIN_WAIT2
 						// so start the FIN_WAIT2 timer.
 						closeTimer = time.AfterFunc(e.tcpLingerTimeout, func() {
@@ -1320,11 +1356,11 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 
 				if n&notifyDrain != 0 {
 					for !e.segmentQueue.empty() {
-						if err := e.handleSegments(); err != nil {
+						if err := e.handleSegments(false /* fastPath */); err != nil {
 							return err
 						}
 					}
-					if e.state != StateClose && e.state != StateError {
+					if e.EndpointState() != StateClose && e.EndpointState() != StateError {
 						// Only block the worker if the endpoint
 						// is not in closed state or error state.
 						close(e.drainDone)
@@ -1349,14 +1385,21 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		s.AddWaker(funcs[i].w, i)
 	}
 
+	// Notify the caller that the waker initialization is complete and the
+	// endpoint is ready.
+	if wakerInitDone != nil {
+		close(wakerInitDone)
+	}
+
+	// Tell waiters that the endpoint is connected and writable.
+	e.waiterQueue.Notify(waiter.EventOut)
+
 	// The following assertions and notifications are needed for restored
 	// endpoints. Fresh newly created endpoints have empty states and should
 	// not invoke any.
-	e.segmentQueue.mu.Lock()
-	if !e.segmentQueue.list.Empty() {
+	if !e.segmentQueue.empty() {
 		e.newSegmentWaker.Assert()
 	}
-	e.segmentQueue.mu.Unlock()
 
 	e.rcvListMu.Lock()
 	if !e.rcvList.Empty() {
@@ -1372,27 +1415,32 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 	// Main loop. Handle segments until both send and receive ends of the
 	// connection have completed.
 
-	for e.state != StateTimeWait && e.state != StateClose && e.state != StateError {
+	for e.EndpointState() != StateTimeWait && e.EndpointState() != StateClose && e.EndpointState() != StateError {
 		e.mu.Unlock()
 		e.workMu.Unlock()
 		v, _ := s.Fetch(true)
 		e.workMu.Lock()
+		// We need to double check here because the notification
+		// maybe stale by the time we got around to processing it.
+		// NOTE: since we now hold the workMu the processors cannot
+		// change the state of the endpoint so it' safe to proceed
+		// after this check.
+		if e.EndpointState() == StateTimeWait || e.EndpointState() == StateClose || e.EndpointState() == StateError {
+			e.mu.Lock()
+			break
+		}
 		if err := funcs[v].f(); err != nil {
 			e.mu.Lock()
-			// Ensure we release all endpoint registration and route
-			// references as the connection is now in an error
-			// state.
 			e.workerCleanup = true
 			e.resetConnectionLocked(err)
 			// Lock released below.
 			epilogue()
-
 			return nil
 		}
 		e.mu.Lock()
 	}
 
-	state := e.state
+	state := e.EndpointState()
 	e.mu.Unlock()
 	var reuseTW func()
 	if state == StateTimeWait {
@@ -1405,13 +1453,15 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
 		s.Done()
 		// Wake up any waiters before we enter TIME_WAIT.
 		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+		e.mu.Lock()
+		e.workerCleanup = true
+		e.mu.Unlock()
 		reuseTW = e.doTimeWait()
 	}
 
 	// Mark endpoint as closed.
 	e.mu.Lock()
-	if e.state != StateError {
-		e.stack.Stats().TCP.CurrentEstablished.Decrement()
+	if e.EndpointState() != StateError {
 		e.transitionToStateCloseLocked()
 	}
 
@@ -1468,7 +1518,11 @@ func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()
 					tcpEP := listenEP.(*endpoint)
 					if EndpointState(tcpEP.State()) == StateListen {
 						reuseTW = func() {
-							tcpEP.enqueueSegment(s)
+							if !tcpEP.enqueueSegment(s) {
+								s.decRef()
+								return
+							}
+							tcpEP.newSegmentWaker.Assert()
 						}
 						// We explicitly do not decRef
 						// the segment as it's still
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
new file mode 100644
index 000000000..a72f0c379
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -0,0 +1,218 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// epQueue is a queue of endpoints.
+type epQueue struct {
+	mu   sync.Mutex
+	list endpointList
+}
+
+// enqueue adds e to the queue if the endpoint is not already on the queue.
+func (q *epQueue) enqueue(e *endpoint) {
+	q.mu.Lock()
+	if e.pendingProcessing {
+		q.mu.Unlock()
+		return
+	}
+	q.list.PushBack(e)
+	e.pendingProcessing = true
+	q.mu.Unlock()
+}
+
+// dequeue removes and returns the first element from the queue if available,
+// returns nil otherwise.
+func (q *epQueue) dequeue() *endpoint {
+	q.mu.Lock()
+	if e := q.list.Front(); e != nil {
+		q.list.Remove(e)
+		e.pendingProcessing = false
+		q.mu.Unlock()
+		return e
+	}
+	q.mu.Unlock()
+	return nil
+}
+
+// empty returns true if the queue is empty, false otherwise.
+func (q *epQueue) empty() bool {
+	q.mu.Lock()
+	v := q.list.Empty()
+	q.mu.Unlock()
+	return v
+}
+
+// processor is responsible for processing packets queued to a tcp endpoint.
+type processor struct {
+	epQ              epQueue
+	newEndpointWaker sleep.Waker
+	id               int
+}
+
+func newProcessor(id int) *processor {
+	p := &processor{
+		id: id,
+	}
+	go p.handleSegments()
+	return p
+}
+
+func (p *processor) queueEndpoint(ep *endpoint) {
+	// Queue an endpoint for processing by the processor goroutine.
+	p.epQ.enqueue(ep)
+	p.newEndpointWaker.Assert()
+}
+
+func (p *processor) handleSegments() {
+	const newEndpointWaker = 1
+	s := sleep.Sleeper{}
+	s.AddWaker(&p.newEndpointWaker, newEndpointWaker)
+	defer s.Done()
+	for {
+		s.Fetch(true)
+		for ep := p.epQ.dequeue(); ep != nil; ep = p.epQ.dequeue() {
+			if ep.segmentQueue.empty() {
+				continue
+			}
+
+			// If socket has transitioned out of connected state
+			// then just let the worker handle the packet.
+			//
+			// NOTE: We read this outside of e.mu lock which means
+			// that by the time we get to handleSegments the
+			// endpoint may not be in ESTABLISHED. But this should
+			// be fine as all normal shutdown states are handled by
+			// handleSegments and if the endpoint moves to a
+			// CLOSED/ERROR state then handleSegments is a noop.
+			if ep.EndpointState() != StateEstablished {
+				ep.newSegmentWaker.Assert()
+				continue
+			}
+
+			if !ep.workMu.TryLock() {
+				ep.newSegmentWaker.Assert()
+				continue
+			}
+			// If the endpoint is in a connected state then we do
+			// direct delivery to ensure low latency and avoid
+			// scheduler interactions.
+			if err := ep.handleSegments(true /* fastPath */); err != nil || ep.EndpointState() == StateClose {
+				ep.notifyProtocolGoroutine(notifyTickleWorker)
+				ep.workMu.Unlock()
+				continue
+			}
+
+			if !ep.segmentQueue.empty() {
+				p.epQ.enqueue(ep)
+			}
+
+			ep.workMu.Unlock()
+		}
+	}
+}
+
+// dispatcher manages a pool of TCP endpoint processors which are responsible
+// for the processing of inbound segments. This fixed pool of processor
+// goroutines do full tcp processing. The processor is selected based on the
+// hash of the endpoint id to ensure that delivery for the same endpoint happens
+// in-order.
+type dispatcher struct {
+	processors []*processor
+	seed       uint32
+}
+
+func newDispatcher(nProcessors int) *dispatcher {
+	processors := []*processor{}
+	for i := 0; i < nProcessors; i++ {
+		processors = append(processors, newProcessor(i))
+	}
+	return &dispatcher{
+		processors: processors,
+		seed:       generateRandUint32(),
+	}
+}
+
+func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+	ep := stackEP.(*endpoint)
+	s := newSegment(r, id, pkt)
+	if !s.parse() {
+		ep.stack.Stats().MalformedRcvdPackets.Increment()
+		ep.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
+		ep.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
+		s.decRef()
+		return
+	}
+
+	if !s.csumValid {
+		ep.stack.Stats().MalformedRcvdPackets.Increment()
+		ep.stack.Stats().TCP.ChecksumErrors.Increment()
+		ep.stats.ReceiveErrors.ChecksumErrors.Increment()
+		s.decRef()
+		return
+	}
+
+	ep.stack.Stats().TCP.ValidSegmentsReceived.Increment()
+	ep.stats.SegmentsReceived.Increment()
+	if (s.flags & header.TCPFlagRst) != 0 {
+		ep.stack.Stats().TCP.ResetsReceived.Increment()
+	}
+
+	if !ep.enqueueSegment(s) {
+		s.decRef()
+		return
+	}
+
+	// For sockets not in established state let the worker goroutine
+	// handle the packets.
+	if ep.EndpointState() != StateEstablished {
+		ep.newSegmentWaker.Assert()
+		return
+	}
+
+	d.selectProcessor(id).queueEndpoint(ep)
+}
+
+func generateRandUint32() uint32 {
+	b := make([]byte, 4)
+	if _, err := rand.Read(b); err != nil {
+		panic(err)
+	}
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func (d *dispatcher) selectProcessor(id stack.TransportEndpointID) *processor {
+	payload := []byte{
+		byte(id.LocalPort),
+		byte(id.LocalPort >> 8),
+		byte(id.RemotePort),
+		byte(id.RemotePort >> 8)}
+
+	h := jenkins.Sum32(d.seed)
+	h.Write(payload)
+	h.Write([]byte(id.LocalAddress))
+	h.Write([]byte(id.RemoteAddress))
+
+	return d.processors[h.Sum32()%uint32(len(d.processors))]
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index cc8b533c8..1799c6e10 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -120,6 +120,7 @@ const (
 	notifyMTUChanged
 	notifyDrain
 	notifyReset
+	notifyResetByPeer
 	notifyKeepaliveChanged
 	notifyMSSChanged
 	// notifyTickleWorker is used to tickle the protocol main loop during a
@@ -127,6 +128,7 @@ const (
 	// ensures the loop terminates if the final state of the endpoint is
 	// say TIME_WAIT.
 	notifyTickleWorker
+	notifyError
 )
 
 // SACKInfo holds TCP SACK related information for a given endpoint.
@@ -283,6 +285,18 @@ func (*EndpointInfo) IsEndpointInfo() {}
 type endpoint struct {
 	EndpointInfo
 
+	// endpointEntry is used to queue endpoints for processing to the
+	// a given tcp processor goroutine.
+	//
+	// Precondition: epQueue.mu must be held to read/write this field..
+	endpointEntry `state:"nosave"`
+
+	// pendingProcessing is true if this endpoint is queued for processing
+	// to a TCP processor.
+	//
+	// Precondition: epQueue.mu must be held to read/write this field..
+	pendingProcessing bool `state:"nosave"`
+
 	// workMu is used to arbitrate which goroutine may perform protocol
 	// work. Only the main protocol goroutine is expected to call Lock() on
 	// it, but other goroutines (e.g., send) may call TryLock() to eagerly
@@ -324,6 +338,7 @@ type endpoint struct {
 	// The following fields are protected by the mutex.
 	mu sync.RWMutex `state:"nosave"`
 
+	// state must be read/set using the EndpointState()/setEndpointState() methods.
 	state EndpointState `state:".(EndpointState)"`
 
 	// origEndpointState is only used during a restore phase to save the
@@ -359,7 +374,7 @@ type endpoint struct {
 	workerRunning bool
 
 	// workerCleanup specifies if the worker goroutine must perform cleanup
-	// before exitting. This can only be set to true when workerRunning is
+	// before exiting. This can only be set to true when workerRunning is
 	// also true, and they're both protected by the mutex.
 	workerCleanup bool
 
@@ -371,6 +386,8 @@ type endpoint struct {
 	// recentTS is the timestamp that should be sent in the TSEcr field of
 	// the timestamp for future segments sent by the endpoint. This field is
 	// updated if required when a new segment is received by this endpoint.
+	//
+	// recentTS must be read/written atomically.
 	recentTS uint32
 
 	// tsOffset is a randomized offset added to the value of the
@@ -567,6 +584,47 @@ func (e *endpoint) ResumeWork() {
 	e.workMu.Unlock()
 }
 
+// setEndpointState updates the state of the endpoint to state atomically. This
+// method is unexported as the only place we should update the state is in this
+// package but we allow the state to be read freely without holding e.mu.
+//
+// Precondition: e.mu must be held to call this method.
+func (e *endpoint) setEndpointState(state EndpointState) {
+	oldstate := EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
+	switch state {
+	case StateEstablished:
+		e.stack.Stats().TCP.CurrentEstablished.Increment()
+	case StateError:
+		fallthrough
+	case StateClose:
+		if oldstate == StateCloseWait || oldstate == StateEstablished {
+			e.stack.Stats().TCP.EstablishedResets.Increment()
+		}
+		fallthrough
+	default:
+		if oldstate == StateEstablished {
+			e.stack.Stats().TCP.CurrentEstablished.Decrement()
+		}
+	}
+	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
+}
+
+// EndpointState returns the current state of the endpoint.
+func (e *endpoint) EndpointState() EndpointState {
+	return EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
+}
+
+// setRecentTimestamp atomically sets the recentTS field to the
+// provided value.
+func (e *endpoint) setRecentTimestamp(recentTS uint32) {
+	atomic.StoreUint32(&e.recentTS, recentTS)
+}
+
+// recentTimestamp atomically reads and returns the value of the recentTS field.
+func (e *endpoint) recentTimestamp() uint32 {
+	return atomic.LoadUint32(&e.recentTS)
+}
+
 // keepalive is a synchronization wrapper used to appease stateify. See the
 // comment in endpoint, where it is used.
 //
@@ -656,7 +714,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 
-	switch e.state {
+	switch e.EndpointState() {
 	case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
 		// Ready for nothing.
 
@@ -672,7 +730,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 			}
 		}
 	}
-	if e.state.connected() {
+	if e.EndpointState().connected() {
 		// Determine if the endpoint is writable if requested.
 		if (mask & waiter.EventOut) != 0 {
 			e.sndBufMu.Lock()
@@ -733,14 +791,20 @@ func (e *endpoint) Close() {
 	// Issue a shutdown so that the peer knows we won't send any more data
 	// if we're connected, or stop accepting if we're listening.
 	e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
+	e.closeNoShutdown()
+}
 
+// closeNoShutdown closes the endpoint without doing a full shutdown. This is
+// used when a connection needs to be aborted with a RST and we want to skip
+// a full 4 way TCP shutdown.
+func (e *endpoint) closeNoShutdown() {
 	e.mu.Lock()
 
 	// For listening sockets, we always release ports inline so that they
 	// are immediately available for reuse after Close() is called. If also
 	// registered, we unregister as well otherwise the next user would fail
 	// in Listen() when trying to register.
-	if e.state == StateListen && e.isPortReserved {
+	if e.EndpointState() == StateListen && e.isPortReserved {
 		if e.isRegistered {
 			e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
 			e.isRegistered = false
@@ -780,6 +844,8 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
 		defer close(done)
 		for n := range e.acceptedChan {
 			n.notifyProtocolGoroutine(notifyReset)
+			// close all connections that have completed but
+			// not accepted by the application.
 			n.Close()
 		}
 	}()
@@ -797,11 +863,13 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
 // after Close() is called and the worker goroutine (if any) is done with its
 // work.
 func (e *endpoint) cleanupLocked() {
+
 	// Close all endpoints that might have been accepted by TCP but not by
 	// the client.
 	if e.acceptedChan != nil {
 		e.closePendingAcceptableConnectionsLocked()
 	}
+
 	e.workerCleanup = false
 
 	if e.isRegistered {
@@ -920,7 +988,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 	// reads to proceed before returning a ECONNRESET.
 	e.rcvListMu.Lock()
 	bufUsed := e.rcvBufUsed
-	if s := e.state; !s.connected() && s != StateClose && bufUsed == 0 {
+	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
 		e.rcvListMu.Unlock()
 		he := e.HardError
 		e.mu.RUnlock()
@@ -944,7 +1012,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 
 func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 	if e.rcvBufUsed == 0 {
-		if e.rcvClosed || !e.state.connected() {
+		if e.rcvClosed || !e.EndpointState().connected() {
 			return buffer.View{}, tcpip.ErrClosedForReceive
 		}
 		return buffer.View{}, tcpip.ErrWouldBlock
@@ -980,8 +1048,8 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 // Caller must hold e.mu and e.sndBufMu
 func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
 	// The endpoint cannot be written to if it's not connected.
-	if !e.state.connected() {
-		switch e.state {
+	if !e.EndpointState().connected() {
+		switch e.EndpointState() {
 		case StateError:
 			return 0, e.HardError
 		default:
@@ -1039,42 +1107,86 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		return 0, nil, perr
 	}
 
-	if !opts.Atomic { // See above.
-		e.mu.RLock()
-		e.sndBufMu.Lock()
+	if opts.Atomic {
+		// Add data to the send queue.
+		s := newSegmentFromView(&e.route, e.ID, v)
+		e.sndBufUsed += len(v)
+		e.sndBufInQueue += seqnum.Size(len(v))
+		e.sndQueue.PushBack(s)
+		e.sndBufMu.Unlock()
+		// Release the endpoint lock to prevent deadlocks due to lock
+		// order inversion when acquiring workMu.
+		e.mu.RUnlock()
+	}
 
-		// Because we released the lock before copying, check state again
-		// to make sure the endpoint is still in a valid state for a write.
-		avail, err = e.isEndpointWritableLocked()
-		if err != nil {
+	if e.workMu.TryLock() {
+		// Since we released locks in between it's possible that the
+		// endpoint transitioned to a CLOSED/ERROR states so make
+		// sure endpoint is still writable before trying to write.
+		if !opts.Atomic { // See above.
+			e.mu.RLock()
+			e.sndBufMu.Lock()
+
+			// Because we released the lock before copying, check state again
+			// to make sure the endpoint is still in a valid state for a write.
+			avail, err = e.isEndpointWritableLocked()
+			if err != nil {
+				e.sndBufMu.Unlock()
+				e.mu.RUnlock()
+				e.stats.WriteErrors.WriteClosed.Increment()
+				return 0, nil, err
+			}
+
+			// Discard any excess data copied in due to avail being reduced due
+			// to a simultaneous write call to the socket.
+			if avail < len(v) {
+				v = v[:avail]
+			}
+			// Add data to the send queue.
+			s := newSegmentFromView(&e.route, e.ID, v)
+			e.sndBufUsed += len(v)
+			e.sndBufInQueue += seqnum.Size(len(v))
+			e.sndQueue.PushBack(s)
 			e.sndBufMu.Unlock()
+			// Release the endpoint lock to prevent deadlocks due to lock
+			// order inversion when acquiring workMu.
 			e.mu.RUnlock()
-			e.stats.WriteErrors.WriteClosed.Increment()
-			return 0, nil, err
-		}
 
-		// Discard any excess data copied in due to avail being reduced due
-		// to a simultaneous write call to the socket.
-		if avail < len(v) {
-			v = v[:avail]
 		}
-	}
-
-	// Add data to the send queue.
-	s := newSegmentFromView(&e.route, e.ID, v)
-	e.sndBufUsed += len(v)
-	e.sndBufInQueue += seqnum.Size(len(v))
-	e.sndQueue.PushBack(s)
-	e.sndBufMu.Unlock()
-	// Release the endpoint lock to prevent deadlocks due to lock
-	// order inversion when acquiring workMu.
-	e.mu.RUnlock()
-
-	if e.workMu.TryLock() {
 		// Do the work inline.
 		e.handleWrite()
 		e.workMu.Unlock()
 	} else {
+		if !opts.Atomic { // See above.
+			e.mu.RLock()
+			e.sndBufMu.Lock()
+
+			// Because we released the lock before copying, check state again
+			// to make sure the endpoint is still in a valid state for a write.
+			avail, err = e.isEndpointWritableLocked()
+			if err != nil {
+				e.sndBufMu.Unlock()
+				e.mu.RUnlock()
+				e.stats.WriteErrors.WriteClosed.Increment()
+				return 0, nil, err
+			}
+
+			// Discard any excess data copied in due to avail being reduced due
+			// to a simultaneous write call to the socket.
+			if avail < len(v) {
+				v = v[:avail]
+			}
+			// Add data to the send queue.
+			s := newSegmentFromView(&e.route, e.ID, v)
+			e.sndBufUsed += len(v)
+			e.sndBufInQueue += seqnum.Size(len(v))
+			e.sndQueue.PushBack(s)
+			e.sndBufMu.Unlock()
+			// Release the endpoint lock to prevent deadlocks due to lock
+			// order inversion when acquiring workMu.
+			e.mu.RUnlock()
+
+		}
 		// Let the protocol goroutine do the work.
 		e.sndWaker.Assert()
 	}
@@ -1091,7 +1203,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 
 	// The endpoint can be read if it's connected, or if it's already closed
 	// but has some pending unread data.
-	if s := e.state; !s.connected() && s != StateClose {
+	if s := e.EndpointState(); !s.connected() && s != StateClose {
 		if s == StateError {
 			return 0, tcpip.ControlMessages{}, e.HardError
 		}
@@ -1103,7 +1215,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	defer e.rcvListMu.Unlock()
 
 	if e.rcvBufUsed == 0 {
-		if e.rcvClosed || !e.state.connected() {
+		if e.rcvClosed || !e.EndpointState().connected() {
 			e.stats.ReadErrors.ReadClosed.Increment()
 			return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
 		}
@@ -1187,7 +1299,7 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		defer e.mu.Unlock()
 
 		// We only allow this to be set when we're in the initial state.
-		if e.state != StateInitial {
+		if e.EndpointState() != StateInitial {
 			return tcpip.ErrInvalidEndpointState
 		}
 
@@ -1402,14 +1514,14 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
 				// Acquire the work mutex as we may need to
 				// reinitialize the congestion control state.
 				e.mu.Lock()
-				state := e.state
+				state := e.EndpointState()
 				e.cc = v
 				e.mu.Unlock()
 				switch state {
 				case StateEstablished:
 					e.workMu.Lock()
 					e.mu.Lock()
-					if e.state == state {
+					if e.EndpointState() == state {
 						e.snd.cc = e.snd.initCongestionControl(e.cc)
 					}
 					e.mu.Unlock()
@@ -1472,7 +1584,7 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
 	defer e.mu.RUnlock()
 
 	// The endpoint cannot be in listen state.
-	if e.state == StateListen {
+	if e.EndpointState() == StateListen {
 		return 0, tcpip.ErrInvalidEndpointState
 	}
 
@@ -1731,7 +1843,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 		return err
 	}
 
-	if e.state.connected() {
+	if e.EndpointState().connected() {
 		// The endpoint is already connected. If caller hasn't been
 		// notified yet, return success.
 		if !e.isConnectNotified {
@@ -1743,7 +1855,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	}
 
 	nicID := addr.NIC
-	switch e.state {
+	switch e.EndpointState() {
 	case StateBound:
 		// If we're already bound to a NIC but the caller is requesting
 		// that we use a different one now, we cannot proceed.
@@ -1850,7 +1962,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 	}
 
 	e.isRegistered = true
-	e.state = StateConnecting
+	e.setEndpointState(StateConnecting)
 	e.route = r.Clone()
 	e.boundNICID = nicID
 	e.effectiveNetProtos = netProtos
@@ -1871,14 +1983,13 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 		}
 		e.segmentQueue.mu.Unlock()
 		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
-		e.state = StateEstablished
-		e.stack.Stats().TCP.CurrentEstablished.Increment()
+		e.setEndpointState(StateEstablished)
 	}
 
 	if run {
 		e.workerRunning = true
 		e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
-		go e.protocolMainLoop(handshake) // S/R-SAFE: will be drained before save.
+		go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save.
 	}
 
 	return tcpip.ErrConnectStarted
@@ -1896,7 +2007,7 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 	e.shutdownFlags |= flags
 	finQueued := false
 	switch {
-	case e.state.connected():
+	case e.EndpointState().connected():
 		// Close for read.
 		if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
 			// Mark read side as closed.
@@ -1908,8 +2019,23 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 			// If we're fully closed and we have unread data we need to abort
 			// the connection with a RST.
 			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
-				e.notifyProtocolGoroutine(notifyReset)
+				// Move the socket to error state immediately.
+				// This is done redundantly because in case of
+				// save/restore on a Shutdown/Close() the socket
+				// state needs to indicate the error otherwise
+				// save file will show the socket in established
+				// state even though snd/rcv are closed.
 				e.mu.Unlock()
+				// Try to send an active reset immediately if the
+				// work mutex is available.
+				if e.workMu.TryLock() {
+					e.mu.Lock()
+					e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+					e.mu.Unlock()
+					e.workMu.Unlock()
+				} else {
+					e.notifyProtocolGoroutine(notifyReset)
+				}
 				return nil
 			}
 		}
@@ -1931,11 +2057,10 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 			finQueued = true
 			// Mark endpoint as closed.
 			e.sndClosed = true
-
 			e.sndBufMu.Unlock()
 		}
 
-	case e.state == StateListen:
+	case e.EndpointState() == StateListen:
 		// Tell protocolListenLoop to stop.
 		if flags&tcpip.ShutdownRead != 0 {
 			e.notifyProtocolGoroutine(notifyClose)
@@ -1976,7 +2101,7 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	// When the endpoint shuts down, it sets workerCleanup to true, and from
 	// that point onward, acceptedChan is the responsibility of the cleanup()
 	// method (and should not be touched anywhere else, including here).
-	if e.state == StateListen && !e.workerCleanup {
+	if e.EndpointState() == StateListen && !e.workerCleanup {
 		// Adjust the size of the channel iff we can fix existing
 		// pending connections into the new one.
 		if len(e.acceptedChan) > backlog {
@@ -1994,7 +2119,7 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 		return nil
 	}
 
-	if e.state == StateInitial {
+	if e.EndpointState() == StateInitial {
 		// The listen is called on an unbound socket, the socket is
 		// automatically bound to a random free port with the local
 		// address set to INADDR_ANY.
@@ -2004,7 +2129,7 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	}
 
 	// Endpoint must be bound before it can transition to listen mode.
-	if e.state != StateBound {
+	if e.EndpointState() != StateBound {
 		e.stats.ReadErrors.InvalidEndpointState.Increment()
 		return tcpip.ErrInvalidEndpointState
 	}
@@ -2015,24 +2140,27 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	}
 
 	e.isRegistered = true
-	e.state = StateListen
+	e.setEndpointState(StateListen)
+
 	if e.acceptedChan == nil {
 		e.acceptedChan = make(chan *endpoint, backlog)
 	}
 	e.workerRunning = true
-
 	go e.protocolListenLoop( // S/R-SAFE: drained on save.
 		seqnum.Size(e.receiveBufferAvailable()))
-
 	return nil
 }
 
 // startAcceptedLoop sets up required state and starts a goroutine with the
 // main loop for accepted connections.
 func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
+	e.mu.Lock()
 	e.waiterQueue = waiterQueue
 	e.workerRunning = true
-	go e.protocolMainLoop(false) // S/R-SAFE: drained on save.
+	e.mu.Unlock()
+	wakerInitDone := make(chan struct{})
+	go e.protocolMainLoop(false, wakerInitDone) // S/R-SAFE: drained on save.
+	<-wakerInitDone
 }
 
 // Accept returns a new endpoint if a peer has established a connection
@@ -2042,7 +2170,7 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 	defer e.mu.RUnlock()
 
 	// Endpoint must be in listen state before it can accept connections.
-	if e.state != StateListen {
+	if e.EndpointState() != StateListen {
 		return nil, nil, tcpip.ErrInvalidEndpointState
 	}
 
@@ -2069,7 +2197,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 	// Don't allow binding once endpoint is not in the initial state
 	// anymore. This is because once the endpoint goes into a connected or
 	// listen state, it is already bound.
-	if e.state != StateInitial {
+	if e.EndpointState() != StateInitial {
 		return tcpip.ErrAlreadyBound
 	}
 
@@ -2131,7 +2259,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 	}
 
 	// Mark endpoint as bound.
-	e.state = StateBound
+	e.setEndpointState(StateBound)
 
 	return nil
 }
@@ -2153,7 +2281,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 
-	if !e.state.connected() {
+	if !e.EndpointState().connected() {
 		return tcpip.FullAddress{}, tcpip.ErrNotConnected
 	}
 
@@ -2164,45 +2292,22 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	}, nil
 }
 
-// HandlePacket is called by the stack when new packets arrive to this transport
-// endpoint.
 func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
-	s := newSegment(r, id, pkt)
-	if !s.parse() {
-		e.stack.Stats().MalformedRcvdPackets.Increment()
-		e.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
-		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
-		s.decRef()
-		return
-	}
-
-	if !s.csumValid {
-		e.stack.Stats().MalformedRcvdPackets.Increment()
-		e.stack.Stats().TCP.ChecksumErrors.Increment()
-		e.stats.ReceiveErrors.ChecksumErrors.Increment()
-		s.decRef()
-		return
-	}
-
-	e.stack.Stats().TCP.ValidSegmentsReceived.Increment()
-	e.stats.SegmentsReceived.Increment()
-	if (s.flags & header.TCPFlagRst) != 0 {
-		e.stack.Stats().TCP.ResetsReceived.Increment()
-	}
-
-	e.enqueueSegment(s)
+	// TCP HandlePacket is not required anymore as inbound packets first
+	// land at the Dispatcher which then can either delivery using the
+	// worker go routine or directly do the invoke the tcp processing inline
+	// based on the state of the endpoint.
 }
 
-func (e *endpoint) enqueueSegment(s *segment) {
+func (e *endpoint) enqueueSegment(s *segment) bool {
 	// Send packet to worker goroutine.
-	if e.segmentQueue.enqueue(s) {
-		e.newSegmentWaker.Assert()
-	} else {
+	if !e.segmentQueue.enqueue(s) {
 		// The queue is full, so we drop the segment.
 		e.stack.Stats().DroppedPackets.Increment()
 		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
-		s.decRef()
+		return false
 	}
+	return true
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
@@ -2319,8 +2424,8 @@ func (e *endpoint) rcvWndScaleForHandshake() int {
 // updateRecentTimestamp updates the recent timestamp using the algorithm
 // described in https://tools.ietf.org/html/rfc7323#section-4.3
 func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
-	if e.sendTSOk && seqnum.Value(e.recentTS).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
-		e.recentTS = tsVal
+	if e.sendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
+		e.setRecentTimestamp(tsVal)
 	}
 }
 
@@ -2330,7 +2435,7 @@ func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value,
 func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) {
 	if synOpts.TS {
 		e.sendTSOk = true
-		e.recentTS = synOpts.TSVal
+		e.setRecentTimestamp(synOpts.TSVal)
 	}
 }
 
@@ -2419,7 +2524,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
 
 	// Endpoint TCP Option state.
 	s.SendTSOk = e.sendTSOk
-	s.RecentTS = e.recentTS
+	s.RecentTS = e.recentTimestamp()
 	s.TSOffset = e.tsOffset
 	s.SACKPermitted = e.sackPermitted
 	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
@@ -2526,9 +2631,7 @@ func (e *endpoint) initGSO() {
 // State implements tcpip.Endpoint.State. It exports the endpoint's protocol
 // state for diagnostics.
 func (e *endpoint) State() uint32 {
-	e.mu.Lock()
-	defer e.mu.Unlock()
-	return uint32(e.state)
+	return uint32(e.EndpointState())
 }
 
 // Info returns a copy of the endpoint info.
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 4b8d867bc..4a46f0ec5 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -16,6 +16,7 @@ package tcp
 
 import (
 	"fmt"
+	"sync/atomic"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/sync"
@@ -48,7 +49,7 @@ func (e *endpoint) beforeSave() {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
-	switch e.state {
+	switch e.EndpointState() {
 	case StateInitial, StateBound:
 		// TODO(b/138137272): this enumeration duplicates
 		// EndpointState.connected. remove it.
@@ -70,31 +71,30 @@ func (e *endpoint) beforeSave() {
 		fallthrough
 	case StateListen, StateConnecting:
 		e.drainSegmentLocked()
-		if e.state != StateClose && e.state != StateError {
+		if e.EndpointState() != StateClose && e.EndpointState() != StateError {
 			if !e.workerRunning {
 				panic("endpoint has no worker running in listen, connecting, or connected state")
 			}
 			break
 		}
-		fallthrough
 	case StateError, StateClose:
-		for (e.state == StateError || e.state == StateClose) && e.workerRunning {
+		for e.workerRunning {
 			e.mu.Unlock()
 			time.Sleep(100 * time.Millisecond)
 			e.mu.Lock()
 		}
 		if e.workerRunning {
-			panic("endpoint still has worker running in closed or error state")
+			panic(fmt.Sprintf("endpoint: %+v still has worker running in closed or error state", e.ID))
 		}
 	default:
-		panic(fmt.Sprintf("endpoint in unknown state %v", e.state))
+		panic(fmt.Sprintf("endpoint in unknown state %v", e.EndpointState()))
 	}
 
 	if e.waiterQueue != nil && !e.waiterQueue.IsEmpty() {
 		panic("endpoint still has waiters upon save")
 	}
 
-	if e.state != StateClose && !((e.state == StateBound || e.state == StateListen) == e.isPortReserved) {
+	if e.EndpointState() != StateClose && !((e.EndpointState() == StateBound || e.EndpointState() == StateListen) == e.isPortReserved) {
 		panic("endpoints which are not in the closed state must have a reserved port IFF they are in bound or listen state")
 	}
 }
@@ -135,7 +135,7 @@ func (e *endpoint) loadAcceptedChan(acceptedEndpoints []*endpoint) {
 
 // saveState is invoked by stateify.
 func (e *endpoint) saveState() EndpointState {
-	return e.state
+	return e.EndpointState()
 }
 
 // Endpoint loading must be done in the following ordering by their state, to
@@ -151,7 +151,8 @@ var connectingLoading sync.WaitGroup
 func (e *endpoint) loadState(state EndpointState) {
 	// This is to ensure that the loading wait groups include all applicable
 	// endpoints before any asynchronous calls to the Wait() methods.
-	if state.connected() {
+	// For restore purposes we treat TimeWait like a connected endpoint.
+	if state.connected() || state == StateTimeWait {
 		connectedLoading.Add(1)
 	}
 	switch state {
@@ -160,13 +161,14 @@ func (e *endpoint) loadState(state EndpointState) {
 	case StateConnecting, StateSynSent, StateSynRecv:
 		connectingLoading.Add(1)
 	}
-	e.state = state
+	// Directly update the state here rather than using e.setEndpointState
+	// as the endpoint is still being loaded and the stack reference to increment
+	// metrics is not yet initialized.
+	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
 }
 
 // afterLoad is invoked by stateify.
 func (e *endpoint) afterLoad() {
-	// Freeze segment queue before registering to prevent any segments
-	// from being delivered while it is being restored.
 	e.origEndpointState = e.state
 	// Restore the endpoint to InitialState as it will be moved to
 	// its origEndpointState during Resume.
@@ -180,7 +182,6 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	e.segmentQueue.setLimit(MaxUnprocessedSegments)
 	e.workMu.Init()
 	state := e.origEndpointState
-
 	switch state {
 	case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
 		var ss SendBufferSizeOption
@@ -276,7 +277,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 				listenLoading.Wait()
 				connectingLoading.Wait()
 				bind()
-				e.state = StateClose
+				e.setEndpointState(StateClose)
 				tcpip.AsyncLoading.Done()
 			}()
 		}
@@ -288,6 +289,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		e.stack.CompleteTransportEndpointCleanup(e)
 		tcpip.DeleteDanglingEndpoint(e)
 	}
+
 }
 
 // saveLastError is invoked by stateify.
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 9a8f64aa6..958c06fa7 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -21,6 +21,7 @@
 package tcp
 
 import (
+	"runtime"
 	"strings"
 	"time"
 
@@ -104,6 +105,7 @@ type protocol struct {
 	moderateReceiveBuffer      bool
 	tcpLingerTimeout           time.Duration
 	tcpTimeWaitTimeout         time.Duration
+	dispatcher                 *dispatcher
 }
 
 // Number returns the tcp protocol number.
@@ -134,6 +136,14 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 	return h.SourcePort(), h.DestinationPort(), nil
 }
 
+// QueuePacket queues packets targeted at an endpoint after hashing the packet
+// to a specific processing queue. Each queue is serviced by its own processor
+// goroutine which is responsible for dequeuing and doing full TCP dispatch of
+// the packet.
+func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+	p.dispatcher.queuePacket(r, ep, id, pkt)
+}
+
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
 //
@@ -330,5 +340,6 @@ func NewProtocol() stack.TransportProtocol {
 		availableCongestionControl: []string{ccReno, ccCubic},
 		tcpLingerTimeout:           DefaultTCPLingerTimeout,
 		tcpTimeWaitTimeout:         DefaultTCPTimeWaitTimeout,
+		dispatcher:                 newDispatcher(runtime.GOMAXPROCS(0)),
 	}
 }
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 05c8488f8..958f03ac1 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -169,19 +169,19 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 		// We just received a FIN, our next state depends on whether we sent a
 		// FIN already or not.
 		r.ep.mu.Lock()
-		switch r.ep.state {
+		switch r.ep.EndpointState() {
 		case StateEstablished:
-			r.ep.state = StateCloseWait
+			r.ep.setEndpointState(StateCloseWait)
 		case StateFinWait1:
 			if s.flagIsSet(header.TCPFlagAck) {
 				// FIN-ACK, transition to TIME-WAIT.
-				r.ep.state = StateTimeWait
+				r.ep.setEndpointState(StateTimeWait)
 			} else {
 				// Simultaneous close, expecting a final ACK.
-				r.ep.state = StateClosing
+				r.ep.setEndpointState(StateClosing)
 			}
 		case StateFinWait2:
-			r.ep.state = StateTimeWait
+			r.ep.setEndpointState(StateTimeWait)
 		}
 		r.ep.mu.Unlock()
 
@@ -205,16 +205,16 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
 	// shutdown states.
 	if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == r.ep.snd.sndNxt {
 		r.ep.mu.Lock()
-		switch r.ep.state {
+		switch r.ep.EndpointState() {
 		case StateFinWait1:
-			r.ep.state = StateFinWait2
+			r.ep.setEndpointState(StateFinWait2)
 			// Notify protocol goroutine that we have received an
 			// ACK to our FIN so that it can start the FIN_WAIT2
 			// timer to abort connection if the other side does
 			// not close within 2MSL.
 			r.ep.notifyProtocolGoroutine(notifyClose)
 		case StateClosing:
-			r.ep.state = StateTimeWait
+			r.ep.setEndpointState(StateTimeWait)
 		case StateLastAck:
 			r.ep.transitionToStateCloseLocked()
 		}
@@ -267,7 +267,6 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 	switch state {
 	case StateCloseWait, StateClosing, StateLastAck:
 		if !s.sequenceNumber.LessThanEq(r.rcvNxt) {
-			s.decRef()
 			// Just drop the segment as we have
 			// already received a FIN and this
 			// segment is after the sequence number
@@ -284,7 +283,6 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 		// trigger a RST.
 		endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
 		if rcvClosed && r.rcvNxt.LessThan(endDataSeq) {
-			s.decRef()
 			return true, tcpip.ErrConnectionAborted
 		}
 		if state == StateFinWait1 {
@@ -314,7 +312,6 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 		// the last actual data octet in a segment in
 		// which it occurs.
 		if closed && (!s.flagIsSet(header.TCPFlagFin) || s.sequenceNumber.Add(s.logicalLen()) != r.rcvNxt+1) {
-			s.decRef()
 			return true, tcpip.ErrConnectionAborted
 		}
 	}
@@ -336,7 +333,7 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
 // r as they arrive. It is called by the protocol main loop.
 func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
 	r.ep.mu.RLock()
-	state := r.ep.state
+	state := r.ep.EndpointState()
 	closed := r.ep.closed
 	r.ep.mu.RUnlock()
 
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index fdff7ed81..b74b61e7d 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -705,17 +705,15 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 		}
 		seg.flags = header.TCPFlagAck | header.TCPFlagFin
 		segEnd = seg.sequenceNumber.Add(1)
-		// Transition to FIN-WAIT1 state since we're initiating an active close.
-		s.ep.mu.Lock()
-		switch s.ep.state {
+		// Update the state to reflect that we have now
+		// queued a FIN.
+		switch s.ep.EndpointState() {
 		case StateCloseWait:
-			// We've already received a FIN and are now sending our own. The
-			// sender is now awaiting a final ACK for this FIN.
-			s.ep.state = StateLastAck
+			s.ep.setEndpointState(StateLastAck)
 		default:
-			s.ep.state = StateFinWait1
+			s.ep.setEndpointState(StateFinWait1)
 		}
-		s.ep.mu.Unlock()
+
 	} else {
 		// We're sending a non-FIN segment.
 		if seg.flags&header.TCPFlagFin != 0 {
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 6edfa8dce..a9dfbe857 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -293,7 +293,6 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
 		checker.SeqNum(uint32(c.IRS+1)),
 		checker.AckNum(uint32(iss)+1),
 		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
-
 	finHeaders := &context.Headers{
 		SrcPort: context.TestPort,
 		DstPort: context.StackPort,
@@ -459,6 +458,9 @@ func TestConnectResetAfterClose(t *testing.T) {
 		checker.IPv4(t, b,
 			checker.TCP(
 				checker.DstPort(context.TestPort),
+				// RST is always generated with sndNxt which if the FIN
+				// has been sent will be 1 higher than the sequence number
+				// of the FIN itself.
 				checker.SeqNum(uint32(c.IRS)+2),
 				checker.AckNum(0),
 				checker.TCPFlags(header.TCPFlagRst),
@@ -1500,6 +1502,9 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
 		checker.TCP(
 			checker.DstPort(context.TestPort),
 			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+			// RST is always generated with sndNxt which if the FIN
+			// has been sent will be 1 higher than the sequence
+			// number of the FIN itself.
 			checker.SeqNum(uint32(c.IRS)+2),
 		))
 	// The RST puts the endpoint into an error state.
@@ -5441,6 +5446,7 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 		rawEP.SendPacketWithTS(b[start:start+mss], tsVal)
 		packetsSent++
 	}
+
 	// Resume the worker so that it only sees the packets once all of them
 	// are waiting to be read.
 	worker.ResumeWork()
@@ -5508,7 +5514,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 	stk := c.Stack()
 	// Set lower limits for auto-tuning tests. This is required because the
 	// test stops the worker which can cause packets to be dropped because
-	// the segment queue holding unprocessed packets is limited to 500.
+	// the segment queue holding unprocessed packets is limited to 300.
 	const receiveBufferSize = 80 << 10 // 80KB.
 	const maxReceiveBufferSize = receiveBufferSize * 10
 	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{1, receiveBufferSize, maxReceiveBufferSize}); err != nil {
@@ -5563,6 +5569,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 			totalSent += mss
 			packetsSent++
 		}
+
 		// Resume it so that it only sees the packets once all of them
 		// are waiting to be read.
 		worker.ResumeWork()
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 5d114d460..2f9821555 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -533,7 +533,7 @@ TEST_P(SocketInetLoopbackTest, TCPFinWait2Test_NoRandomSave) {
 
   // Sleep for a little over the linger timeout to reduce flakiness in
   // save/restore tests.
-  absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+  absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 2));
 
   ds.reset();
 
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 6b99c021d..33a5ac66c 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -814,6 +814,20 @@ TEST_P(TcpSocketTest, FullBuffer) {
   t_ = -1;
 }
 
+TEST_P(TcpSocketTest, PollAfterShutdown) {
+  ScopedThread client_thread([this]() {
+    EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceedsWithValue(0));
+    struct pollfd poll_fd = {s_, POLLIN | POLLERR | POLLHUP, 0};
+    EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+                SyscallSucceedsWithValue(1));
+  });
+
+  EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceedsWithValue(0));
+  struct pollfd poll_fd = {t_, POLLIN | POLLERR | POLLHUP, 0};
+  EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+              SyscallSucceedsWithValue(1));
+}
+
 TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListener) {
   // Initialize address to the loopback one.
   sockaddr_storage addr =
-- 
cgit v1.2.3


From c50efc8c700fa2628f1415daeeb3b382009eb1bb Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Thu, 16 Jan 2020 12:48:05 -0800
Subject: Disable xattr tests.

These can remain disabled until we actually support extended attributes.

The following modifications were also made:
1. Disable save/restore on tests that change file permissions. Restore will not
work properly for these tests, since it will try to open the file with
read-write after it has been read- or write-only.
2. Change user.abc to user.test.

PiperOrigin-RevId: 290123941
---
 test/syscalls/BUILD          |   5 --
 test/syscalls/linux/xattr.cc | 152 ++++++++++++-------------------------------
 2 files changed, 42 insertions(+), 115 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index a3a85917d..829693e8e 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -717,11 +717,6 @@ syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test")
 
 syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
 
-syscall_test(
-    add_overlay = True,
-    test = "//test/syscalls/linux:xattr_test",
-)
-
 go_binary(
     name = "syscall_test_runner",
     testonly = 1,
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index 75740238c..b3bc3463e 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -59,7 +59,8 @@ TEST_F(XattrTest, XattrLargeName) {
   std::string name = "user.";
   name += std::string(XATTR_NAME_MAX - name.length(), 'a');
 
-  // TODO(b/127675828): Support setxattr and getxattr.
+  // An xattr should be whitelisted before it can be accessed--do not allow
+  // arbitrary xattrs to be read/written in gVisor.
   if (!IsRunningOnGvisor()) {
     EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
                 SyscallSucceeds());
@@ -83,59 +84,53 @@ TEST_F(XattrTest, XattrInvalidPrefix) {
               SyscallFailsWithErrno(EOPNOTSUPP));
 }
 
-TEST_F(XattrTest, XattrReadOnly) {
+// Do not allow save/restore cycles after making the test file read-only, as
+// the restore will fail to open it with r/w permissions.
+TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
 
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   size_t size = sizeof(val);
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
-                SyscallSucceeds());
-  }
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
 
+  DisableSave ds;
   ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IRUSR));
 
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
               SyscallFailsWithErrno(EACCES));
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    char buf = '-';
-    EXPECT_THAT(getxattr(path, name, &buf, size),
-                SyscallSucceedsWithValue(size));
-    EXPECT_EQ(buf, val);
-  }
+  char buf = '-';
+  EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
 }
 
-TEST_F(XattrTest, XattrWriteOnly) {
+// Do not allow save/restore cycles after making the test file write-only, as
+// the restore will fail to open it with r/w permissions.
+TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
   // Drop capabilities that allow us to override file and directory permissions.
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
   ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
 
+  DisableSave ds;
   ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IWUSR));
 
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   size_t size = sizeof(val);
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
-                SyscallSucceeds());
-  }
+  EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
 
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
 }
 
 TEST_F(XattrTest, XattrTrustedWithNonadmin) {
-  // TODO(b/127675828): Support setxattr and getxattr.
+  // TODO(b/127675828): Support setxattr and getxattr with "trusted" prefix.
   SKIP_IF(IsRunningOnGvisor());
   SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
 
@@ -147,11 +142,8 @@ TEST_F(XattrTest, XattrTrustedWithNonadmin) {
 }
 
 TEST_F(XattrTest, XattrOnDirectory) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
               SyscallSucceeds());
   EXPECT_THAT(getxattr(dir.path().c_str(), name, NULL, 0),
@@ -159,13 +151,10 @@ TEST_F(XattrTest, XattrOnDirectory) {
 }
 
 TEST_F(XattrTest, XattrOnSymlink) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
       TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(link.path().c_str(), name, NULL, 0, /*flags=*/0),
               SyscallSucceeds());
   EXPECT_THAT(getxattr(link.path().c_str(), name, NULL, 0),
@@ -173,7 +162,7 @@ TEST_F(XattrTest, XattrOnSymlink) {
 }
 
 TEST_F(XattrTest, XattrOnInvalidFileTypes) {
-  char name[] = "user.abc";
+  const char name[] = "user.test";
 
   char char_device[] = "/dev/zero";
   EXPECT_THAT(setxattr(char_device, name, NULL, 0, /*flags=*/0),
@@ -191,11 +180,8 @@ TEST_F(XattrTest, XattrOnInvalidFileTypes) {
 }
 
 TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
   size_t size = 1;
   EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
@@ -209,11 +195,8 @@ TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, SetxattrZeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds());
 
@@ -225,7 +208,7 @@ TEST_F(XattrTest, SetxattrZeroSize) {
 
 TEST_F(XattrTest, SetxattrSizeTooLarge) {
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
 
   // Note that each particular fs implementation may stipulate a lower size
   // limit, in which case we actually may fail (e.g. error with ENOSPC) for
@@ -235,43 +218,29 @@ TEST_F(XattrTest, SetxattrSizeTooLarge) {
   EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
               SyscallFailsWithErrno(E2BIG));
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    EXPECT_THAT(getxattr(path, name, nullptr, 0),
-                SyscallFailsWithErrno(ENODATA));
-  }
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
               SyscallFailsWithErrno(EFAULT));
 
-  // TODO(b/127675828): Support setxattr and getxattr.
-  if (!IsRunningOnGvisor()) {
-    EXPECT_THAT(getxattr(path, name, nullptr, 0),
-                SyscallFailsWithErrno(ENODATA));
-  }
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
 
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
 }
 
 TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val(XATTR_SIZE_MAX + 1);
   std::fill(val.begin(), val.end(), 'a');
   size_t size = 1;
@@ -286,11 +255,8 @@ TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
   EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0),
               SyscallSucceeds());
@@ -304,11 +270,8 @@ TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceWithLarger) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
   EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0),
               SyscallSucceeds());
@@ -321,11 +284,8 @@ TEST_F(XattrTest, SetxattrReplaceWithLarger) {
 }
 
 TEST_F(XattrTest, SetxattrCreateFlag) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
               SyscallSucceeds());
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
@@ -335,11 +295,8 @@ TEST_F(XattrTest, SetxattrCreateFlag) {
 }
 
 TEST_F(XattrTest, SetxattrReplaceFlag) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
               SyscallFailsWithErrno(ENODATA));
   EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
@@ -357,11 +314,8 @@ TEST_F(XattrTest, SetxattrInvalidFlags) {
 }
 
 TEST_F(XattrTest, Getxattr) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   int val = 1234;
   size_t size = sizeof(val);
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
@@ -372,11 +326,8 @@ TEST_F(XattrTest, Getxattr) {
 }
 
 TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   std::vector<char> val = {'a', 'a'};
   size_t size = val.size();
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
@@ -387,11 +338,8 @@ TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   EXPECT_THAT(setxattr(path, name, &val, 1, /*flags=*/0), SyscallSucceeds());
 
@@ -405,11 +353,8 @@ TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
 }
 
 TEST_F(XattrTest, GetxattrZeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
               SyscallSucceeds());
@@ -421,11 +366,8 @@ TEST_F(XattrTest, GetxattrZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrSizeTooLarge) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
               SyscallSucceeds());
@@ -440,11 +382,8 @@ TEST_F(XattrTest, GetxattrSizeTooLarge) {
 }
 
 TEST_F(XattrTest, GetxattrNullValue) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   size_t size = sizeof(val);
   EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
@@ -454,11 +393,8 @@ TEST_F(XattrTest, GetxattrNullValue) {
 }
 
 TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
-  // TODO(b/127675828): Support setxattr and getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  char name[] = "user.abc";
+  const char name[] = "user.test";
   char val = 'a';
   size_t size = sizeof(val);
   // Set value with zero size.
@@ -473,13 +409,9 @@ TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
 }
 
 TEST_F(XattrTest, GetxattrNonexistentName) {
-  // TODO(b/127675828): Support getxattr.
-  SKIP_IF(IsRunningOnGvisor());
-
   const char* path = test_file_name_.c_str();
-  std::string name = "user.nonexistent";
-  EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
-              SyscallFailsWithErrno(ENODATA));
+  const char name[] = "user.test";
+  EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
 }  // namespace
-- 
cgit v1.2.3


From 9073521098ee52cdda74a193565b7bbe75d8c35a Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Fri, 17 Jan 2020 13:31:26 -0800
Subject: Convert EventMask to uint64

It is used for signalfd where the maximum signal is 64.

PiperOrigin-RevId: 290331008
---
 pkg/waiter/waiter.go            |   2 +-
 test/syscalls/BUILD             |   2 +
 test/syscalls/linux/signalfd.cc | 118 ++++++++++++++++++++++++----------------
 3 files changed, 74 insertions(+), 48 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go
index f708e95fa..707eb085b 100644
--- a/pkg/waiter/waiter.go
+++ b/pkg/waiter/waiter.go
@@ -62,7 +62,7 @@ import (
 )
 
 // EventMask represents io events as used in the poll() syscall.
-type EventMask uint16
+type EventMask uint64
 
 // Events that waiters can wait on. The meaning is the same as those in the
 // poll() syscall.
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 829693e8e..90d52e73b 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -380,6 +380,8 @@ syscall_test(test = "//test/syscalls/linux:rseq_test")
 
 syscall_test(test = "//test/syscalls/linux:rtsignal_test")
 
+syscall_test(test = "//test/syscalls/linux:signalfd_test")
+
 syscall_test(test = "//test/syscalls/linux:sched_test")
 
 syscall_test(test = "//test/syscalls/linux:sched_yield_test")
diff --git a/test/syscalls/linux/signalfd.cc b/test/syscalls/linux/signalfd.cc
index 09ecad34a..95be4b66c 100644
--- a/test/syscalls/linux/signalfd.cc
+++ b/test/syscalls/linux/signalfd.cc
@@ -39,6 +39,7 @@ namespace testing {
 namespace {
 
 constexpr int kSigno = SIGUSR1;
+constexpr int kSignoMax = 64;  // SIGRTMAX
 constexpr int kSignoAlt = SIGUSR2;
 
 // Returns a new signalfd.
@@ -51,41 +52,45 @@ inline PosixErrorOr<FileDescriptor> NewSignalFD(sigset_t* mask, int flags = 0) {
   return FileDescriptor(fd);
 }
 
-TEST(Signalfd, Basic) {
+class SignalfdTest : public ::testing::TestWithParam<int> {};
+
+TEST_P(SignalfdTest, Basic) {
+  int signo = GetParam();
   // Create the signalfd.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
 
   // Deliver the blocked signal.
   const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
-  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+  ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
 
   // We should now read the signal.
   struct signalfd_siginfo rbuf;
   ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallSucceedsWithValue(sizeof(rbuf)));
-  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf.ssi_signo, signo);
 }
 
-TEST(Signalfd, MaskWorks) {
+TEST_P(SignalfdTest, MaskWorks) {
+  int signo = GetParam();
   // Create two signalfds with different masks.
   sigset_t mask1, mask2;
   sigemptyset(&mask1);
   sigemptyset(&mask2);
-  sigaddset(&mask1, kSigno);
+  sigaddset(&mask1, signo);
   sigaddset(&mask2, kSignoAlt);
   FileDescriptor fd1 = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask1, 0));
   FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask2, 0));
 
   // Deliver the two signals.
   const auto scoped_sigmask1 =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
   const auto scoped_sigmask2 =
       ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSignoAlt));
-  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+  ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
   ASSERT_THAT(tgkill(getpid(), gettid(), kSignoAlt), SyscallSucceeds());
 
   // We should see the signals on the appropriate signalfds.
@@ -98,7 +103,7 @@ TEST(Signalfd, MaskWorks) {
   EXPECT_EQ(rbuf2.ssi_signo, kSignoAlt);
   ASSERT_THAT(read(fd1.get(), &rbuf1, sizeof(rbuf1)),
               SyscallSucceedsWithValue(sizeof(rbuf1)));
-  EXPECT_EQ(rbuf1.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf1.ssi_signo, signo);
 }
 
 TEST(Signalfd, Cloexec) {
@@ -111,11 +116,12 @@ TEST(Signalfd, Cloexec) {
   EXPECT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
 }
 
-TEST(Signalfd, Blocking) {
+TEST_P(SignalfdTest, Blocking) {
+  int signo = GetParam();
   // Create the signalfd in blocking mode.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
 
   // Shared tid variable.
@@ -136,7 +142,7 @@ TEST(Signalfd, Blocking) {
     struct signalfd_siginfo rbuf;
     ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
                 SyscallSucceedsWithValue(sizeof(rbuf)));
-    EXPECT_EQ(rbuf.ssi_signo, kSigno);
+    EXPECT_EQ(rbuf.ssi_signo, signo);
   });
 
   // Wait until blocked.
@@ -149,20 +155,21 @@ TEST(Signalfd, Blocking) {
   //
   // See gvisor.dev/issue/139.
   if (IsRunningOnGvisor()) {
-    ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+    ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
   } else {
-    ASSERT_THAT(tgkill(getpid(), tid, kSigno), SyscallSucceeds());
+    ASSERT_THAT(tgkill(getpid(), tid, signo), SyscallSucceeds());
   }
 
   // Ensure that it was received.
   t.Join();
 }
 
-TEST(Signalfd, ThreadGroup) {
+TEST_P(SignalfdTest, ThreadGroup) {
+  int signo = GetParam();
   // Create the signalfd in blocking mode.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
 
   // Shared variable.
@@ -176,7 +183,7 @@ TEST(Signalfd, ThreadGroup) {
     struct signalfd_siginfo rbuf;
     ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
                 SyscallSucceedsWithValue(sizeof(rbuf)));
-    EXPECT_EQ(rbuf.ssi_signo, kSigno);
+    EXPECT_EQ(rbuf.ssi_signo, signo);
 
     // Wait for the other thread.
     absl::MutexLock ml(&mu);
@@ -185,7 +192,7 @@ TEST(Signalfd, ThreadGroup) {
   });
 
   // Deliver the signal to the threadgroup.
-  ASSERT_THAT(kill(getpid(), kSigno), SyscallSucceeds());
+  ASSERT_THAT(kill(getpid(), signo), SyscallSucceeds());
 
   // Wait for the first thread to process.
   {
@@ -194,13 +201,13 @@ TEST(Signalfd, ThreadGroup) {
   }
 
   // Deliver to the thread group again (other thread still exists).
-  ASSERT_THAT(kill(getpid(), kSigno), SyscallSucceeds());
+  ASSERT_THAT(kill(getpid(), signo), SyscallSucceeds());
 
   // Ensure that we can also receive it.
   struct signalfd_siginfo rbuf;
   ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallSucceedsWithValue(sizeof(rbuf)));
-  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf.ssi_signo, signo);
 
   // Mark the test as done.
   {
@@ -212,11 +219,12 @@ TEST(Signalfd, ThreadGroup) {
   t.Join();
 }
 
-TEST(Signalfd, Nonblock) {
+TEST_P(SignalfdTest, Nonblock) {
+  int signo = GetParam();
   // Create the signalfd in non-blocking mode.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_NONBLOCK));
 
@@ -227,20 +235,21 @@ TEST(Signalfd, Nonblock) {
 
   // Block and deliver the signal.
   const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
-  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+  ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
 
   // Ensure that a read actually works.
   ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallSucceedsWithValue(sizeof(rbuf)));
-  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf.ssi_signo, signo);
 
   // Should block again.
   EXPECT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallFailsWithErrno(EWOULDBLOCK));
 }
 
-TEST(Signalfd, SetMask) {
+TEST_P(SignalfdTest, SetMask) {
+  int signo = GetParam();
   // Create the signalfd matching nothing.
   sigset_t mask;
   sigemptyset(&mask);
@@ -249,8 +258,8 @@ TEST(Signalfd, SetMask) {
 
   // Block and deliver a signal.
   const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
-  ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+  ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
 
   // We should have nothing.
   struct signalfd_siginfo rbuf;
@@ -258,29 +267,30 @@ TEST(Signalfd, SetMask) {
               SyscallFailsWithErrno(EWOULDBLOCK));
 
   // Change the signal mask.
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   ASSERT_THAT(signalfd(fd.get(), &mask, 0), SyscallSucceeds());
 
   // We should now have the signal.
   ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
               SyscallSucceedsWithValue(sizeof(rbuf)));
-  EXPECT_EQ(rbuf.ssi_signo, kSigno);
+  EXPECT_EQ(rbuf.ssi_signo, signo);
 }
 
-TEST(Signalfd, Poll) {
+TEST_P(SignalfdTest, Poll) {
+  int signo = GetParam();
   // Create the signalfd.
   sigset_t mask;
   sigemptyset(&mask);
-  sigaddset(&mask, kSigno);
+  sigaddset(&mask, signo);
   FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
 
   // Block the signal, and start a thread to deliver it.
   const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSigno));
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
   pid_t orig_tid = gettid();
   ScopedThread t([&] {
     absl::SleepFor(absl::Seconds(5));
-    ASSERT_THAT(tgkill(getpid(), orig_tid, kSigno), SyscallSucceeds());
+    ASSERT_THAT(tgkill(getpid(), orig_tid, signo), SyscallSucceeds());
   });
 
   // Start polling for the signal. We expect that it is not available at the
@@ -297,19 +307,18 @@ TEST(Signalfd, Poll) {
               SyscallSucceedsWithValue(sizeof(rbuf)));
 }
 
-TEST(Signalfd, KillStillKills) {
-  sigset_t mask;
-  sigemptyset(&mask);
-  sigaddset(&mask, SIGKILL);
-  FileDescriptor fd =
-      ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_CLOEXEC));
-
-  // Just because there is a signalfd, we shouldn't see any change in behavior
-  // for unblockable signals. It's easier to test this with SIGKILL.
-  const auto scoped_sigmask =
-      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, SIGKILL));
-  EXPECT_EXIT(tgkill(getpid(), gettid(), SIGKILL), KilledBySignal(SIGKILL), "");
+std::string PrintSigno(::testing::TestParamInfo<int> info) {
+  switch (info.param) {
+    case kSigno:
+      return "kSigno";
+    case kSignoMax:
+      return "kSignoMax";
+    default:
+      return absl::StrCat(info.param);
+  }
 }
+INSTANTIATE_TEST_SUITE_P(Signalfd, SignalfdTest,
+                         ::testing::Values(kSigno, kSignoMax), PrintSigno);
 
 TEST(Signalfd, Ppoll) {
   sigset_t mask;
@@ -328,6 +337,20 @@ TEST(Signalfd, Ppoll) {
               SyscallSucceedsWithValue(0));
 }
 
+TEST(Signalfd, KillStillKills) {
+  sigset_t mask;
+  sigemptyset(&mask);
+  sigaddset(&mask, SIGKILL);
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_CLOEXEC));
+
+  // Just because there is a signalfd, we shouldn't see any change in behavior
+  // for unblockable signals. It's easier to test this with SIGKILL.
+  const auto scoped_sigmask =
+      ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, SIGKILL));
+  EXPECT_EXIT(tgkill(getpid(), gettid(), SIGKILL), KilledBySignal(SIGKILL), "");
+}
+
 }  // namespace
 
 }  // namespace testing
@@ -340,6 +363,7 @@ int main(int argc, char** argv) {
   sigset_t set;
   sigemptyset(&set);
   sigaddset(&set, gvisor::testing::kSigno);
+  sigaddset(&set, gvisor::testing::kSignoMax);
   sigaddset(&set, gvisor::testing::kSignoAlt);
   TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
 
-- 
cgit v1.2.3


From 2ba6198851dc1e293295d7cadf8c0ae456b68beb Mon Sep 17 00:00:00 2001
From: Dean Deng <deandeng@google.com>
Date: Tue, 21 Jan 2020 12:41:50 -0800
Subject: Add syscalls for lgetxattr, fgetxattr, lsetxattr, and fsetxattr.

Note that these simply will use the same logic as getxattr and setxattr, which
is not yet implemented for most filesystems.

PiperOrigin-RevId: 290800960
---
 pkg/sentry/syscalls/linux/linux64_amd64.go |   8 +-
 pkg/sentry/syscalls/linux/linux64_arm64.go |   8 +-
 pkg/sentry/syscalls/linux/sys_xattr.go     | 136 +++++++++++++++++++++--------
 test/syscalls/linux/xattr.cc               |  41 +++++++++
 4 files changed, 150 insertions(+), 43 deletions(-)

(limited to 'test/syscalls/linux')

diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 6b2920900..c76771a54 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -229,11 +229,11 @@ var AMD64 = &kernel.SyscallTable{
 		186: syscalls.Supported("gettid", Gettid),
 		187: syscalls.Supported("readahead", Readahead),
 		188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
 		191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
 		194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index c9629f6f3..d3587fda6 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -42,11 +42,11 @@ var ARM64 = &kernel.SyscallTable{
 		3:   syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		4:   syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
 		5:   syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil),
-		6:   syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		7:   syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		6:   syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil),
+		7:   syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil),
 		8:   syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil),
-		9:   syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
-		10:  syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+		9:   syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil),
+		10:  syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil),
 		11:  syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		12:  syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
 		13:  syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 23d20da6f..e35c077d6 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -27,6 +27,40 @@ import (
 
 // GetXattr implements linux syscall getxattr(2).
 func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getXattrFromPath(t, args, true)
+}
+
+// LGetXattr implements linux syscall lgetxattr(2).
+func LGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return getXattrFromPath(t, args, false)
+}
+
+// FGetXattr implements linux syscall fgetxattr(2).
+func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := uint64(args[3].SizeT())
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	n, value, err := getXattr(t, f.Dirent, nameAddr, size)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if _, err := t.CopyOutBytes(valueAddr, []byte(value)); err != nil {
+		return 0, nil, err
+	}
+	return uintptr(n), nil, nil
+}
+
+func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
 	pathAddr := args[0].Pointer()
 	nameAddr := args[1].Pointer()
 	valueAddr := args[2].Pointer()
@@ -38,29 +72,17 @@ func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	}
 
 	valueLen := 0
-	err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
-		// If getxattr(2) is called with size 0, the size of the value will be
-		// returned successfully even if it is nonzero. In that case, we need to
-		// retrieve the entire attribute value so we can return the correct size.
-		requestedSize := size
-		if size == 0 || size > linux.XATTR_SIZE_MAX {
-			requestedSize = linux.XATTR_SIZE_MAX
+	err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
 		}
 
-		value, err := getXattr(t, d, dirPath, nameAddr, uint64(requestedSize))
+		n, value, err := getXattr(t, d, nameAddr, size)
+		valueLen = n
 		if err != nil {
 			return err
 		}
 
-		valueLen = len(value)
-		if uint64(valueLen) > requestedSize {
-			return syserror.ERANGE
-		}
-
-		// Skip copying out the attribute value if size is 0.
-		if size == 0 {
-			return nil
-		}
 		_, err = t.CopyOutBytes(valueAddr, []byte(value))
 		return err
 	})
@@ -71,29 +93,73 @@ func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 }
 
 // getXattr implements getxattr(2) from the given *fs.Dirent.
-func getXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr, size uint64) (string, error) {
-	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
-		return "", syserror.ENOTDIR
-	}
-
+func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr, size uint64) (int, string, error) {
 	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
-		return "", err
+		return 0, "", err
 	}
 
 	name, err := copyInXattrName(t, nameAddr)
 	if err != nil {
-		return "", err
+		return 0, "", err
 	}
 
 	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
-		return "", syserror.EOPNOTSUPP
+		return 0, "", syserror.EOPNOTSUPP
 	}
 
-	return d.Inode.GetXattr(t, name, size)
+	// If getxattr(2) is called with size 0, the size of the value will be
+	// returned successfully even if it is nonzero. In that case, we need to
+	// retrieve the entire attribute value so we can return the correct size.
+	requestedSize := size
+	if size == 0 || size > linux.XATTR_SIZE_MAX {
+		requestedSize = linux.XATTR_SIZE_MAX
+	}
+
+	value, err := d.Inode.GetXattr(t, name, requestedSize)
+	if err != nil {
+		return 0, "", err
+	}
+	n := len(value)
+	if uint64(n) > requestedSize {
+		return 0, "", syserror.ERANGE
+	}
+
+	// Don't copy out the attribute value if size is 0.
+	if size == 0 {
+		return n, "", nil
+	}
+	return n, value, nil
 }
 
 // SetXattr implements linux syscall setxattr(2).
 func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return setXattrFromPath(t, args, true)
+}
+
+// LSetXattr implements linux syscall lsetxattr(2).
+func LSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return setXattrFromPath(t, args, false)
+}
+
+// FSetXattr implements linux syscall fsetxattr(2).
+func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	nameAddr := args[1].Pointer()
+	valueAddr := args[2].Pointer()
+	size := uint64(args[3].SizeT())
+	flags := args[4].Uint()
+
+	// TODO(b/113957122): Return EBADF if the fd was opened with O_PATH.
+	f := t.GetFile(fd)
+	if f == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer f.DecRef()
+
+	return 0, nil, setXattr(t, f.Dirent, nameAddr, valueAddr, uint64(size), flags)
+}
+
+func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) {
 	pathAddr := args[0].Pointer()
 	nameAddr := args[1].Pointer()
 	valueAddr := args[2].Pointer()
@@ -105,19 +171,19 @@ func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		return 0, nil, err
 	}
 
-	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
-		return 0, nil, syserror.EINVAL
-	}
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+		if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+			return syserror.ENOTDIR
+		}
 
-	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
-		return setXattr(t, d, dirPath, nameAddr, valueAddr, uint64(size), flags)
+		return setXattr(t, d, nameAddr, valueAddr, uint64(size), flags)
 	})
 }
 
 // setXattr implements setxattr(2) from the given *fs.Dirent.
-func setXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error {
-	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
-		return syserror.ENOTDIR
+func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64, flags uint32) error {
+	if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+		return syserror.EINVAL
 	}
 
 	if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
@@ -133,7 +199,7 @@ func setXattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr us
 		return syserror.E2BIG
 	}
 	buf := make([]byte, size)
-	if _, err = t.CopyInBytes(valueAddr, buf); err != nil {
+	if _, err := t.CopyInBytes(valueAddr, buf); err != nil {
 		return err
 	}
 	value := string(buf)
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
index b3bc3463e..e77c355d7 100644
--- a/test/syscalls/linux/xattr.cc
+++ b/test/syscalls/linux/xattr.cc
@@ -26,6 +26,7 @@
 #include "gtest/gtest.h"
 #include "test/syscalls/linux/file_base.h"
 #include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
 #include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
@@ -414,6 +415,46 @@ TEST_F(XattrTest, GetxattrNonexistentName) {
   EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
 }
 
+TEST_F(XattrTest, LGetSetxattrOnSymlink) {
+  TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
+
+  EXPECT_THAT(lsetxattr(link.path().c_str(), nullptr, nullptr, 0, 0),
+              SyscallFailsWithErrno(EPERM));
+  EXPECT_THAT(lgetxattr(link.path().c_str(), nullptr, nullptr, 0),
+              SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, LGetSetxattrOnNonsymlink) {
+  const char* path = test_file_name_.c_str();
+  const char name[] = "user.test";
+  int val = 1234;
+  size_t size = sizeof(val);
+  EXPECT_THAT(lsetxattr(path, name, &val, size, /*flags=*/0),
+              SyscallSucceeds());
+
+  int buf = 0;
+  EXPECT_THAT(lgetxattr(path, name, &buf, size),
+              SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, FGetSetxattr) {
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0));
+  const char name[] = "user.test";
+  int val = 1234;
+  size_t size = sizeof(val);
+  EXPECT_THAT(fsetxattr(fd.get(), name, &val, size, /*flags=*/0),
+              SyscallSucceeds());
+
+  int buf = 0;
+  EXPECT_THAT(fgetxattr(fd.get(), name, &buf, size),
+              SyscallSucceedsWithValue(size));
+  EXPECT_EQ(buf, val);
+}
+
 }  // namespace
 
 }  // namespace testing
-- 
cgit v1.2.3