Fix listen backlog handling to be in parity with Linux

- Change the accept queue full condition for a listening endpoint to only honor completed (and delivered) connections. - Use syncookies if the number of incomplete connections is beyond listen backlog. This also cleans up the SynThreshold option code as that is no longer used with this change. - Added a new stack option to unconditionally generate syncookies. Similar to sysctl -w net.ipv4.tcp_syncookies=2 on Linux. - Enable keeping of incomplete connections beyond listen backlog. - Drop incoming SYNs only if the accept queue is filled up. - Drop incoming ACKs that complete handshakes when accept queue is full - Enable the stack to accept one more connection than programmed by listen backlog. - Handle backlog argument being zero, negative for listen, as Linux. - Add syscall and packetimpact tests to reflect the changes above. - Remove TCPConnectBacklog test which is polling for completed connections on the client side which is not reflective of whether the accept queue is filled up by the test. The modified syscall test in this CL addresses testing of connecting sockets. Fixes #3153 PiperOrigin-RevId: 366935921
author: Mithun Iyer <iyerm@google.com> 2021-04-05 21:51:31 -0700
committer: gVisor bot <gvisor-bot@google.com> 2021-04-05 21:53:41 -0700
commit: 56c69fb0e7f96e5bd5da9e0d29a78e05dd3e2bba (patch)
tree: c231730c05bf555753b19577edf05e7235a9569e /test
parent: 7a7fcf2dbaa7bdcdb9b523358de91c71d5cb05d8 (diff)
5 files changed, 394 insertions, 122 deletions
diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl
index 34e83ec49..634c15727 100644
--- a/test/packetimpact/runner/defs.bzl
+++ b/test/packetimpact/runner/defs.bzl
@@ -246,6 +246,12 @@ ALL_TESTS = [
         expect_netstack_failure = True,
     ),
     PacketimpactTestInfo(
+        name = "tcp_listen_backlog",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_syncookie",
+    ),
+    PacketimpactTestInfo(
         name = "icmpv6_param_problem",
     ),
     PacketimpactTestInfo(
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 92103c1e9..83ff70951 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -385,6 +385,26 @@ packetimpact_testbench(
     ],
 )
 
+packetimpact_testbench(
+    name = "tcp_listen_backlog",
+    srcs = ["tcp_listen_backlog_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_testbench(
+    name = "tcp_syncookie",
+    srcs = ["tcp_syncookie_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
 validate_all_tests()
 
 [packetimpact_go_test(
diff --git a/test/packetimpact/tests/tcp_listen_backlog_test.go b/test/packetimpact/tests/tcp_listen_backlog_test.go
new file mode 100644
index 000000000..26c812d0a
--- /dev/null
+++ b/test/packetimpact/tests/tcp_listen_backlog_test.go
@@ -0,0 +1,86 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_listen_backlog_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.Initialize(flag.CommandLine)
+}
+
+// TestTCPListenBacklog tests for a listening endpoint behavior:
+// (1) reply to more SYNs than what is configured as listen backlog
+// (2) ignore ACKs (that complete a handshake) when the accept queue is full
+// (3) ignore incoming SYNs when the accept queue is full
+func TestTCPListenBacklog(t *testing.T) {
+	dut := testbench.NewDUT(t)
+
+	// Listening endpoint accepts one more connection than the listen backlog.
+	listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 0 /*backlog*/)
+
+	var establishedConn testbench.TCPIPv4
+	var incompleteConn testbench.TCPIPv4
+
+	// Test if the DUT listener replies to more SYNs than listen backlog+1
+	for i, conn := range []*testbench.TCPIPv4{&establishedConn, &incompleteConn} {
+		*conn = dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+		// Expect dut connection to have transitioned to SYN-RCVD state.
+		conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)})
+		if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil {
+			t.Fatalf("expected SYN-ACK for %d connection, %s", i, err)
+		}
+	}
+	defer establishedConn.Close(t)
+	defer incompleteConn.Close(t)
+
+	// Send the ACK to complete handshake.
+	establishedConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)})
+	dut.PollOne(t, listenFd, unix.POLLIN, time.Second)
+
+	// Send the ACK to complete handshake, expect this to be ignored by the
+	// listener.
+	incompleteConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)})
+
+	// Drain the accept queue to enable poll for subsequent connections on the
+	// listener.
+	dut.Accept(t, listenFd)
+
+	// The ACK for the incomplete connection should be ignored by the
+	// listening endpoint and the poll on listener should now time out.
+	if pfds := dut.Poll(t, []unix.PollFd{{Fd: listenFd, Events: unix.POLLIN}}, time.Second); len(pfds) != 0 {
+		t.Fatalf("got dut.Poll(...) = %#v", pfds)
+	}
+
+	// Re-send the ACK to complete handshake and re-fill the accept-queue.
+	incompleteConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)})
+	dut.PollOne(t, listenFd, unix.POLLIN, time.Second)
+
+	// Now initiate a new connection when the accept queue is full.
+	connectingConn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer connectingConn.Close(t)
+	// Expect dut connection to drop the SYN and let the client stay in SYN_SENT state.
+	connectingConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)})
+	if got, err := connectingConn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err == nil {
+		t.Fatalf("expected no SYN-ACK, but got %s", got)
+	}
+}
diff --git a/test/packetimpact/tests/tcp_syncookie_test.go b/test/packetimpact/tests/tcp_syncookie_test.go
new file mode 100644
index 000000000..1c21c62ff
--- /dev/null
+++ b/test/packetimpact/tests/tcp_syncookie_test.go
@@ -0,0 +1,70 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_syncookie_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.Initialize(flag.CommandLine)
+}
+
+// TestSynCookie test if the DUT listener is replying back using syn cookies.
+// The test does not complete the handshake by not sending the ACK to SYNACK.
+// When syncookies are not used, this forces the listener to retransmit SYNACK.
+// And when syncookies are being used, there is no such retransmit.
+func TestTCPSynCookie(t *testing.T) {
+	dut := testbench.NewDUT(t)
+
+	// Listening endpoint accepts one more connection than the listen backlog.
+	_, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/)
+
+	var withoutSynCookieConn testbench.TCPIPv4
+	var withSynCookieConn testbench.TCPIPv4
+
+	// Test if the DUT listener replies to more SYNs than listen backlog+1
+	for _, conn := range []*testbench.TCPIPv4{&withoutSynCookieConn, &withSynCookieConn} {
+		*conn = dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	}
+	defer withoutSynCookieConn.Close(t)
+	defer withSynCookieConn.Close(t)
+
+	checkSynAck := func(t *testing.T, conn *testbench.TCPIPv4, expectRetransmit bool) {
+		// Expect dut connection to have transitioned to SYN-RCVD state.
+		conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)})
+		if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil {
+			t.Fatalf("expected SYN-ACK, but got %s", err)
+		}
+
+		// If the DUT listener is using syn cookies, it will not retransmit SYNACK
+		got, err := conn.ExpectData(t, &testbench.TCP{SeqNum: testbench.Uint32(uint32(*conn.RemoteSeqNum(t) - 1)), Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, 2*time.Second)
+		if expectRetransmit && err != nil {
+			t.Fatalf("expected retransmitted SYN-ACK, but got %s", err)
+		}
+		if !expectRetransmit && err == nil {
+			t.Fatalf("expected no retransmitted SYN-ACK, but got %s", got)
+		}
+	}
+
+	t.Run("without syncookies", func(t *testing.T) { checkSynAck(t, &withoutSynCookieConn, true /*expectRetransmit*/) })
+	t.Run("with syncookies", func(t *testing.T) { checkSynAck(t, &withSynCookieConn, false /*expectRetransmit*/) })
+}
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 597b5bcb1..d391363fb 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -489,13 +489,6 @@ void TestListenWhileConnect(const TestParam& param,
   TestAddress const& listener = param.listener;
   TestAddress const& connector = param.connector;
 
-  constexpr int kBacklog = 2;
-  // Linux completes one more connection than the listen backlog argument.
-  // To ensure that there is at least one client connection that stays in
-  // connecting state, keep 2 more client connections than the listen backlog.
-  // gVisor differs in this behavior though, gvisor.dev/issue/3153.
-  constexpr int kClients = kBacklog + 2;
-
   // Create the listening socket.
   FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
       Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
@@ -503,6 +496,13 @@ void TestListenWhileConnect(const TestParam& param,
   ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
                    listener.addr_len),
               SyscallSucceeds());
+  // This test is only interested in deterministically getting a socket in
+  // connecting state. For that, we use a listen backlog of zero which would
+  // mean there is exactly one connection that gets established and is enqueued
+  // to the accept queue. We poll on the listener to ensure that is enqueued.
+  // After that the subsequent client connect will stay in connecting state as
+  // the accept queue is full.
+  constexpr int kBacklog = 0;
   ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
 
   // Get the port bound by the listening socket.
@@ -515,42 +515,49 @@ void TestListenWhileConnect(const TestParam& param,
 
   sockaddr_storage conn_addr = connector.addr;
   ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
-  std::vector<FileDescriptor> clients;
-  for (int i = 0; i < kClients; i++) {
-    FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE(
-        Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
-    int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
-                      connector.addr_len);
-    if (ret != 0) {
-      EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
-      clients.push_back(std::move(client));
-    }
+  FileDescriptor established_client = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+  ASSERT_THAT(
+      connect(established_client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+              connector.addr_len),
+      SyscallSucceeds());
+
+  // Ensure that the accept queue has the completed connection.
+  constexpr int kTimeout = 10000;
+  pollfd pfd = {
+      .fd = listen_fd.get(),
+      .events = POLLIN,
+  };
+  ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+  ASSERT_EQ(pfd.revents, POLLIN);
+
+  FileDescriptor connecting_client = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+  // Keep the last client in connecting state.
+  int ret =
+      connect(connecting_client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+              connector.addr_len);
+  if (ret != 0) {
+    EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
   }
 
   stopListen(listen_fd);
 
-  for (auto& client : clients) {
-    constexpr int kTimeout = 10000;
+  std::array<std::pair<int, int>, 2> sockets = {
+      std::make_pair(established_client.get(), ECONNRESET),
+      std::make_pair(connecting_client.get(), ECONNREFUSED),
+  };
+  for (size_t i = 0; i < sockets.size(); i++) {
+    SCOPED_TRACE(absl::StrCat("i=", i));
+    auto [fd, expected_errno] = sockets[i];
     pollfd pfd = {
-        .fd = client.get(),
-        .events = POLLIN,
+        .fd = fd,
     };
-    // When the listening socket is closed, then we expect the remote to reset
-    // the connection.
-    ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
-    ASSERT_EQ(pfd.revents, POLLIN | POLLHUP | POLLERR);
+    // When the listening socket is closed, the peer would reset the connection.
+    EXPECT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+    EXPECT_EQ(pfd.revents, POLLHUP | POLLERR);
     char c;
-    // Subsequent read can fail with:
-    // ECONNRESET: If the client connection was established and was reset by the
-    // remote.
-    // ECONNREFUSED: If the client connection failed to be established.
-    ASSERT_THAT(read(client.get(), &c, sizeof(c)),
-                AnyOf(SyscallFailsWithErrno(ECONNRESET),
-                      SyscallFailsWithErrno(ECONNREFUSED)));
-    // The last client connection would be in connecting (SYN_SENT) state.
-    if (client.get() == clients[kClients - 1].get()) {
-      ASSERT_EQ(errno, ECONNREFUSED) << strerror(errno);
-    }
+    EXPECT_THAT(read(fd, &c, sizeof(c)), SyscallFailsWithErrno(expected_errno));
   }
 }
 
@@ -570,7 +577,59 @@ TEST_P(SocketInetLoopbackTest, TCPListenShutdownWhileConnect) {
 // random save as established connections which can't be delivered to the accept
 // queue because the queue is full are not correctly delivered after restore
 // causing the last accept to timeout on the restore.
-TEST_P(SocketInetLoopbackTest, TCPbacklog_NoRandomSave) {
+TEST_P(SocketInetLoopbackTest, TCPAcceptBacklogSizes_NoRandomSave) {
+  auto const& param = GetParam();
+
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+  std::array<int, 3> backlogs = {-1, 0, 1};
+  for (auto& backlog : backlogs) {
+    ASSERT_THAT(listen(listen_fd.get(), backlog), SyscallSucceeds());
+
+    int expected_accepts;
+    if (backlog < 0) {
+      expected_accepts = 1024;
+    } else {
+      expected_accepts = backlog + 1;
+    }
+    for (int i = 0; i < expected_accepts; i++) {
+      SCOPED_TRACE(absl::StrCat("i=", i));
+      // Connect to the listening socket.
+      const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+          Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+      sockaddr_storage conn_addr = connector.addr;
+      ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+      ASSERT_THAT(
+          RetryEINTR(connect)(conn_fd.get(),
+                              reinterpret_cast<struct sockaddr*>(&conn_addr),
+                              connector.addr_len),
+          SyscallSucceeds());
+      const FileDescriptor accepted =
+          ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+    }
+  }
+}
+
+// TODO(b/157236388): Remove _NoRandomSave once bug is fixed. Test fails w/
+// random save as established connections which can't be delivered to the accept
+// queue because the queue is full are not correctly delivered after restore
+// causing the last accept to timeout on the restore.
+TEST_P(SocketInetLoopbackTest, TCPBacklog_NoRandomSave) {
   auto const& param = GetParam();
 
   TestAddress const& listener = param.listener;
@@ -595,6 +654,7 @@ TEST_P(SocketInetLoopbackTest, TCPbacklog_NoRandomSave) {
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
   int i = 0;
   while (1) {
+    SCOPED_TRACE(absl::StrCat("i=", i));
     int ret;
 
     // Connect to the listening socket.
@@ -620,103 +680,133 @@ TEST_P(SocketInetLoopbackTest, TCPbacklog_NoRandomSave) {
     i++;
   }
 
+  int client_conns = i;
+  int accepted_conns = 0;
   for (; i != 0; i--) {
-    // Accept the connection.
-    //
-    // We have to assign a name to the accepted socket, as unamed temporary
-    // objects are destructed upon full evaluation of the expression it is in,
-    // potentially causing the connecting socket to fail to shutdown properly.
-    auto accepted =
-        ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+    SCOPED_TRACE(absl::StrCat("i=", i));
+    pollfd pfd = {
+        .fd = listen_fd.get(),
+        .events = POLLIN,
+    };
+    // Look for incoming connections to accept. The last connect request could
+    // be established from the client side, but the ACK of the handshake could
+    // be dropped by the listener if the accept queue was filled up by the
+    // previous connect.
+    int ret;
+    ASSERT_THAT(ret = poll(&pfd, 1, 3000), SyscallSucceeds());
+    if (ret == 0) break;
+    if (pfd.revents == POLLIN) {
+      // Accept the connection.
+      //
+      // We have to assign a name to the accepted socket, as unamed temporary
+      // objects are destructed upon full evaluation of the expression it is in,
+      // potentially causing the connecting socket to fail to shutdown properly.
+      auto accepted =
+          ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+      accepted_conns++;
+    }
   }
+  // We should accept at least listen backlog + 1 connections. As the stack is
+  // enqueuing established connections to the accept queue, newer SYNs could
+  // still be replied to causing those client connections would be accepted as
+  // we start dequeuing the queue.
+  ASSERT_GE(accepted_conns, kBacklogSize + 1);
+  ASSERT_GE(client_conns, accepted_conns);
 }
 
-// Test if the stack completes atmost listen backlog number of client
-// connections. It exercises the path of the stack that enqueues completed
-// connections to accept queue vs new incoming SYNs.
-TEST_P(SocketInetLoopbackTest, TCPConnectBacklog_NoRandomSave) {
-  const auto& param = GetParam();
-  const TestAddress& listener = param.listener;
-  const TestAddress& connector = param.connector;
+// TODO(b/157236388): Remove _NoRandomSave once bug is fixed. Test fails w/
+// random save as established connections which can't be delivered to the accept
+// queue because the queue is full are not correctly delivered after restore
+// causing the last accept to timeout on the restore.
+TEST_P(SocketInetLoopbackTest, TCPBacklogAcceptAll_NoRandomSave) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
 
+  // Create the listening socket.
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
   constexpr int kBacklog = 1;
-  // Keep the number of client connections more than the listen backlog.
-  // Linux completes one more connection than the listen backlog argument.
-  // gVisor differs in this behavior though, gvisor.dev/issue/3153.
-  int kClients = kBacklog + 2;
-  if (IsRunningOnGvisor()) {
-    kClients--;
-  }
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
 
-  // Run the following test for few iterations to test race between accept queue
-  // getting filled with incoming SYNs.
-  for (int num = 0; num < 10; num++) {
-    FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
-        Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
-    sockaddr_storage listen_addr = listener.addr;
-    ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
-                     listener.addr_len),
-                SyscallSucceeds());
-    ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
-    socklen_t addrlen = listener.addr_len;
-    ASSERT_THAT(
-        getsockname(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
-                    &addrlen),
-        SyscallSucceeds());
-    uint16_t const port =
-        ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
-    sockaddr_storage conn_addr = connector.addr;
-    ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
 
-    std::vector<FileDescriptor> clients;
-    // Issue multiple non-blocking client connects.
-    for (int i = 0; i < kClients; i++) {
-      FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE(
-          Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
-      int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
-                        connector.addr_len);
-      if (ret != 0) {
-        EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
-      }
-      clients.push_back(std::move(client));
+  // Fill up the accept queue and trigger more client connections which would be
+  // waiting to be accepted.
+  std::array<FileDescriptor, kBacklog + 1> established_clients;
+  for (auto& fd : established_clients) {
+    fd = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    ASSERT_THAT(connect(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                        connector.addr_len),
+                SyscallSucceeds());
+  }
+  std::array<FileDescriptor, kBacklog> waiting_clients;
+  for (auto& fd : waiting_clients) {
+    fd = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+    int ret = connect(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                      connector.addr_len);
+    if (ret != 0) {
+      EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
     }
+  }
 
-    // Now that client connects are issued, wait for the accept queue to get
-    // filled and ensure no new client connection is completed.
-    for (int i = 0; i < kClients; i++) {
-      pollfd pfd = {
-          .fd = clients[i].get(),
-          .events = POLLOUT,
-      };
-      if (i < kClients - 1) {
-        // Poll for client side connection completions with a large timeout.
-        // We cannot poll on the listener side without calling accept as poll
-        // stays level triggered with non-zero accept queue length.
-        //
-        // Client side poll would not guarantee that the completed connection
-        // has been enqueued in to the acccept queue, but the fact that the
-        // listener ACKd the SYN, means that it cannot complete any new incoming
-        // SYNs when it has already ACKd for > backlog number of SYNs.
-        ASSERT_THAT(poll(&pfd, 1, 10000), SyscallSucceedsWithValue(1))
-            << "num=" << num << " i=" << i << " kClients=" << kClients;
-        ASSERT_EQ(pfd.revents, POLLOUT) << "num=" << num << " i=" << i;
-      } else {
-        // Now that we expect accept queue filled up, ensure that the last
-        // client connection never completes with a smaller poll timeout.
-        ASSERT_THAT(poll(&pfd, 1, 1000), SyscallSucceedsWithValue(0))
-            << "num=" << num << " i=" << i;
-      }
+  auto accept_connection = [&]() {
+    constexpr int kTimeout = 10000;
+    pollfd pfd = {
+        .fd = listen_fd.get(),
+        .events = POLLIN,
+    };
+    ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+    ASSERT_EQ(pfd.revents, POLLIN);
+    // Accept the connection.
+    //
+    // We have to assign a name to the accepted socket, as unamed temporary
+    // objects are destructed upon full evaluation of the expression it is in,
+    // potentially causing the connecting socket to fail to shutdown properly.
+    auto accepted =
+        ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+  };
 
-      ASSERT_THAT(close(clients[i].release()), SyscallSucceedsWithValue(0))
-          << "num=" << num << " i=" << i;
-    }
-    clients.clear();
-    // We close the listening side and open a new listener. We could instead
-    // drain the accept queue by calling accept() and reuse the listener, but
-    // that is racy as the retransmitted SYNs could get ACKd as we make room in
-    // the accept queue.
-    ASSERT_THAT(close(listen_fd.release()), SyscallSucceedsWithValue(0));
+  // Ensure that we accept all client connections. The waiting connections would
+  // get enqueued as we drain the accept queue.
+  for (int i = 0; i < std::size(established_clients); i++) {
+    SCOPED_TRACE(absl::StrCat("established clients i=", i));
+    accept_connection();
+  }
+
+  // The waiting client connections could be in one of these 2 states:
+  // (1) SYN_SENT: if the SYN was dropped because accept queue was full
+  // (2) ESTABLISHED: if the listener sent back a SYNACK, but may have dropped
+  // the ACK from the client if the accept queue was full (send out a data to
+  // re-send that ACK, to address that case).
+  for (int i = 0; i < std::size(waiting_clients); i++) {
+    SCOPED_TRACE(absl::StrCat("waiting clients i=", i));
+    constexpr int kTimeout = 10000;
+    pollfd pfd = {
+        .fd = waiting_clients[i].get(),
+        .events = POLLOUT,
+    };
+    EXPECT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+    EXPECT_EQ(pfd.revents, POLLOUT);
+    char c;
+    EXPECT_THAT(RetryEINTR(send)(waiting_clients[i].get(), &c, sizeof(c), 0),
+                SyscallSucceedsWithValue(sizeof(c)));
+    accept_connection();
   }
 }
author	Mithun Iyer <iyerm@google.com>	2021-04-05 21:51:31 -0700
committer	gVisor bot <gvisor-bot@google.com>	2021-04-05 21:53:41 -0700
commit	56c69fb0e7f96e5bd5da9e0d29a78e05dd3e2bba (patch)
tree	c231730c05bf555753b19577edf05e7235a9569e /test
parent	7a7fcf2dbaa7bdcdb9b523358de91c71d5cb05d8 (diff)