Fix listen backlog handling to be in parity with Linux

- Change the accept queue full condition for a listening endpoint to only honor completed (and delivered) connections. - Use syncookies if the number of incomplete connections is beyond listen backlog. This also cleans up the SynThreshold option code as that is no longer used with this change. - Added a new stack option to unconditionally generate syncookies. Similar to sysctl -w net.ipv4.tcp_syncookies=2 on Linux. - Enable keeping of incomplete connections beyond listen backlog. - Drop incoming SYNs only if the accept queue is filled up. - Drop incoming ACKs that complete handshakes when accept queue is full - Enable the stack to accept one more connection than programmed by listen backlog. - Handle backlog argument being zero, negative for listen, as Linux. - Add syscall and packetimpact tests to reflect the changes above. - Remove TCPConnectBacklog test which is polling for completed connections on the client side which is not reflective of whether the accept queue is filled up by the test. The modified syscall test in this CL addresses testing of connecting sockets. Fixes #3153 PiperOrigin-RevId: 366935921
author: Mithun Iyer <iyerm@google.com> 2021-04-05 21:51:31 -0700
committer: gVisor bot <gvisor-bot@google.com> 2021-04-05 21:53:41 -0700
commit: 56c69fb0e7f96e5bd5da9e0d29a78e05dd3e2bba (patch)
tree: c231730c05bf555753b19577edf05e7235a9569e
parent: 7a7fcf2dbaa7bdcdb9b523358de91c71d5cb05d8 (diff)
16 files changed, 527 insertions, 360 deletions
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 9bdf6d3d8..0141e8a96 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -35,12 +35,6 @@ import (
 
 // LINT.IfChange
 
-// minListenBacklog is the minimum reasonable backlog for listening sockets.
-const minListenBacklog = 8
-
-// maxListenBacklog is the maximum allowed backlog for listening sockets.
-const maxListenBacklog = 1024
-
 // maxAddrLen is the maximum socket address length we're willing to accept.
 const maxAddrLen = 200
 
@@ -382,14 +376,6 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 		return 0, nil, syserror.ENOTSOCK
 	}
 
-	// Per Linux, the backlog is silently capped to reasonable values.
-	if backlog <= 0 {
-		backlog = minListenBacklog
-	}
-	if backlog > maxListenBacklog {
-		backlog = maxListenBacklog
-	}
-
 	return 0, nil, s.Listen(t, int(backlog)).ToError()
 }
 
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index a87a66146..7cc0be892 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -35,12 +35,6 @@ import (
 	"gvisor.dev/gvisor/pkg/hostarch"
 )
 
-// minListenBacklog is the minimum reasonable backlog for listening sockets.
-const minListenBacklog = 8
-
-// maxListenBacklog is the maximum allowed backlog for listening sockets.
-const maxListenBacklog = 1024
-
 // maxAddrLen is the maximum socket address length we're willing to accept.
 const maxAddrLen = 200
 
@@ -386,14 +380,6 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 		return 0, nil, syserror.ENOTSOCK
 	}
 
-	// Per Linux, the backlog is silently capped to reasonable values.
-	if backlog <= 0 {
-		backlog = minListenBacklog
-	}
-	if backlog > maxListenBacklog {
-		backlog = maxListenBacklog
-	}
-
 	return 0, nil, s.Listen(t, int(backlog)).ToError()
 }
 
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 87ea09a5e..60de16579 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -786,6 +786,13 @@ func (*TCPRecovery) isGettableTransportProtocolOption() {}
 
 func (*TCPRecovery) isSettableTransportProtocolOption() {}
 
+// TCPAlwaysUseSynCookies indicates unconditional usage of syncookies.
+type TCPAlwaysUseSynCookies bool
+
+func (*TCPAlwaysUseSynCookies) isGettableTransportProtocolOption() {}
+
+func (*TCPAlwaysUseSynCookies) isSettableTransportProtocolOption() {}
+
 const (
 	// TCPRACKLossDetection indicates RACK is used for loss detection and
 	// recovery.
@@ -1020,19 +1027,6 @@ func (*TCPMaxRetriesOption) isGettableTransportProtocolOption() {}
 
 func (*TCPMaxRetriesOption) isSettableTransportProtocolOption() {}
 
-// TCPSynRcvdCountThresholdOption is used by SetSockOpt/GetSockOpt to specify
-// the number of endpoints that can be in SYN-RCVD state before the stack
-// switches to using SYN cookies.
-type TCPSynRcvdCountThresholdOption uint64
-
-func (*TCPSynRcvdCountThresholdOption) isGettableSocketOption() {}
-
-func (*TCPSynRcvdCountThresholdOption) isSettableSocketOption() {}
-
-func (*TCPSynRcvdCountThresholdOption) isGettableTransportProtocolOption() {}
-
-func (*TCPSynRcvdCountThresholdOption) isSettableTransportProtocolOption() {}
-
 // TCPSynRetriesOption is used by SetSockOpt/GetSockOpt to specify stack-wide
 // default for number of times SYN is retransmitted before aborting a connect.
 type TCPSynRetriesOption uint8
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 025b134e2..7372ebc08 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -51,11 +51,6 @@ const (
 	// timestamp and the current timestamp. If the difference is greater
 	// than maxTSDiff, the cookie is expired.
 	maxTSDiff = 2
-
-	// SynRcvdCountThreshold is the default global maximum number of
-	// connections that are allowed to be in SYN-RCVD state before TCP
-	// starts using SYN cookies to accept connections.
-	SynRcvdCountThreshold uint64 = 1000
 )
 
 var (
@@ -80,9 +75,6 @@ func encodeMSS(mss uint16) uint32 {
 type listenContext struct {
 	stack *stack.Stack
 
-	// synRcvdCount is a reference to the stack level synRcvdCount.
-	synRcvdCount *synRcvdCounter
-
 	// rcvWnd is the receive window that is sent by this listening context
 	// in the initial SYN-ACK.
 	rcvWnd seqnum.Size
@@ -138,11 +130,6 @@ func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size,
 		listenEP:         listenEP,
 		pendingEndpoints: make(map[stack.TransportEndpointID]*endpoint),
 	}
-	p, ok := stk.TransportProtocolInstance(ProtocolNumber).(*protocol)
-	if !ok {
-		panic(fmt.Sprintf("unable to get TCP protocol instance from stack: %+v", stk))
-	}
-	l.synRcvdCount = p.SynRcvdCounter()
 
 	rand.Read(l.nonce[0][:])
 	rand.Read(l.nonce[1][:])
@@ -199,6 +186,14 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu
 	return (v - l.cookieHash(id, cookieTS, 1)) & hashMask, true
 }
 
+func (l *listenContext) useSynCookies() bool {
+	var alwaysUseSynCookies tcpip.TCPAlwaysUseSynCookies
+	if err := l.stack.TransportProtocolOption(header.TCPProtocolNumber, &alwaysUseSynCookies); err != nil {
+		panic(fmt.Sprintf("TransportProtocolOption(%d, %T) = %s", header.TCPProtocolNumber, alwaysUseSynCookies, err))
+	}
+	return bool(alwaysUseSynCookies) || (l.listenEP != nil && l.listenEP.synRcvdBacklogFull())
+}
+
 // createConnectingEndpoint creates a new endpoint in a connecting state, with
 // the connection parameters given by the arguments.
 func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, tcpip.Error) {
@@ -307,6 +302,7 @@ func (l *listenContext) startHandshake(s *segment, opts *header.TCPSynOptions, q
 
 	// Initialize and start the handshake.
 	h := ep.newPassiveHandshake(isn, irs, opts, deferAccept)
+	h.listenEP = l.listenEP
 	h.start()
 	return h, nil
 }
@@ -485,7 +481,6 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 	}
 
 	go func() {
-		defer ctx.synRcvdCount.dec()
 		if err := h.complete(); err != nil {
 			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
 			e.stats.FailedConnectionAttempts.Increment()
@@ -497,24 +492,29 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
 		h.ep.startAcceptedLoop()
 		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
 		e.deliverAccepted(h.ep, false /*withSynCookie*/)
-	}() // S/R-SAFE: synRcvdCount is the barrier.
+	}()
 
 	return nil
 }
 
-func (e *endpoint) incSynRcvdCount() bool {
+func (e *endpoint) synRcvdBacklogFull() bool {
 	e.acceptMu.Lock()
-	canInc := int(atomic.LoadInt32(&e.synRcvdCount)) < cap(e.acceptedChan)
+	acceptedChanCap := cap(e.acceptedChan)
 	e.acceptMu.Unlock()
-	if canInc {
-		atomic.AddInt32(&e.synRcvdCount, 1)
-	}
-	return canInc
+	// The allocated accepted channel size would always be one greater than the
+	// listen backlog. But, the SYNRCVD connections count is always checked
+	// against the listen backlog value for Linux parity reason.
+	// https://github.com/torvalds/linux/blob/7acac4b3196/include/net/inet_connection_sock.h#L280
+	//
+	// We maintain an equality check here as the synRcvdCount is incremented
+	// and compared only from a single listener context and the capacity of
+	// the accepted channel can only increase by a new listen call.
+	return int(atomic.LoadInt32(&e.synRcvdCount)) == acceptedChanCap-1
 }
 
 func (e *endpoint) acceptQueueIsFull() bool {
 	e.acceptMu.Lock()
-	full := len(e.acceptedChan)+int(atomic.LoadInt32(&e.synRcvdCount)) >= cap(e.acceptedChan)
+	full := len(e.acceptedChan) == cap(e.acceptedChan)
 	e.acceptMu.Unlock()
 	return full
 }
@@ -539,17 +539,13 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err
 	switch {
 	case s.flags == header.TCPFlagSyn:
 		opts := parseSynSegmentOptions(s)
-		if ctx.synRcvdCount.inc() {
-			// Only handle the syn if the following conditions hold
-			//   - accept queue is not full.
-			//   - number of connections in synRcvd state is less than the
-			//     backlog.
-			if !e.acceptQueueIsFull() && e.incSynRcvdCount() {
+		if !ctx.useSynCookies() {
+			if !e.acceptQueueIsFull() {
 				s.incRef()
+				atomic.AddInt32(&e.synRcvdCount, 1)
 				_ = e.handleSynSegment(ctx, s, &opts)
 				return nil
 			}
-			ctx.synRcvdCount.dec()
 			e.stack.Stats().TCP.ListenOverflowSynDrop.Increment()
 			e.stats.ReceiveErrors.ListenOverflowSynDrop.Increment()
 			e.stack.Stats().DroppedPackets.Increment()
@@ -615,25 +611,6 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err
 			return nil
 		}
 
-		if !ctx.synRcvdCount.synCookiesInUse() {
-			// When not using SYN cookies, as per RFC 793, section 3.9, page 64:
-			// Any acknowledgment is bad if it arrives on a connection still in
-			// the LISTEN state.  An acceptable reset segment should be formed
-			// for any arriving ACK-bearing segment.  The RST should be
-			// formatted as follows:
-			//
-			//  <SEQ=SEG.ACK><CTL=RST>
-			//
-			// Send a reset as this is an ACK for which there is no
-			// half open connections and we are not using cookies
-			// yet.
-			//
-			// The only time we should reach here when a connection
-			// was opened and closed really quickly and a delayed
-			// ACK was received from the sender.
-			return replyWithReset(e.stack, s, e.sendTOS, e.ttl)
-		}
-
 		iss := s.ackNumber - 1
 		irs := s.sequenceNumber - 1
 
@@ -651,7 +628,23 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err
 		if !ok || int(data) >= len(mssTable) {
 			e.stack.Stats().TCP.ListenOverflowInvalidSynCookieRcvd.Increment()
 			e.stack.Stats().DroppedPackets.Increment()
-			return nil
+
+			// When not using SYN cookies, as per RFC 793, section 3.9, page 64:
+			// Any acknowledgment is bad if it arrives on a connection still in
+			// the LISTEN state.  An acceptable reset segment should be formed
+			// for any arriving ACK-bearing segment.  The RST should be
+			// formatted as follows:
+			//
+			//  <SEQ=SEG.ACK><CTL=RST>
+			//
+			// Send a reset as this is an ACK for which there is no
+			// half open connections and we are not using cookies
+			// yet.
+			//
+			// The only time we should reach here when a connection
+			// was opened and closed really quickly and a delayed
+			// ACK was received from the sender.
+			return replyWithReset(e.stack, s, e.sendTOS, e.ttl)
 		}
 		e.stack.Stats().TCP.ListenOverflowSynCookieRcvd.Increment()
 		// Create newly accepted endpoint and deliver it.
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index a9e978cf6..8f0f0c3e9 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -65,11 +65,12 @@ const (
 // NOTE: handshake.ep.mu is held during handshake processing. It is released if
 // we are going to block and reacquired when we start processing an event.
 type handshake struct {
-	ep     *endpoint
-	state  handshakeState
-	active bool
-	flags  header.TCPFlags
-	ackNum seqnum.Value
+	ep       *endpoint
+	listenEP *endpoint
+	state    handshakeState
+	active   bool
+	flags    header.TCPFlags
+	ackNum   seqnum.Value
 
 	// iss is the initial send sequence number, as defined in RFC 793.
 	iss seqnum.Value
@@ -394,6 +395,15 @@ func (h *handshake) synRcvdState(s *segment) tcpip.Error {
 			return nil
 		}
 
+		// Drop the ACK if the accept queue is full.
+		// https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_ipv4.c#L1523
+		// We could abort the connection as well with a tunable as in
+		// https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_minisocks.c#L788
+		if listenEP := h.listenEP; listenEP != nil && listenEP.acceptQueueIsFull() {
+			listenEP.stack.Stats().DroppedPackets.Increment()
+			return nil
+		}
+
 		// Update timestamp if required. See RFC7323, section-4.3.
 		if h.ep.sendTSOk && s.parsedOptions.TS {
 			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index f6a16f96e..d6d68f128 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -565,17 +565,15 @@ func TestV4AcceptOnV4(t *testing.T) {
 }
 
 func testV4ListenClose(t *testing.T, c *context.Context) {
-	// Set the SynRcvd threshold to zero to force a syn cookie based accept
-	// to happen.
-	var opt tcpip.TCPSynRcvdCountThresholdOption
+	opt := tcpip.TCPAlwaysUseSynCookies(true)
 	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
-		t.Fatalf("setting TCPSynRcvdCountThresholdOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
-	const n = uint16(32)
+	const n = 32
 
 	// Start listening.
-	if err := c.EP.Listen(int(tcp.SynRcvdCountThreshold + 1)); err != nil {
+	if err := c.EP.Listen(n); err != nil {
 		t.Fatalf("Listen failed: %v", err)
 	}
 
@@ -591,9 +589,9 @@ func testV4ListenClose(t *testing.T, c *context.Context) {
 		})
 	}
 
-	// Each of these ACK's will cause a syn-cookie based connection to be
+	// Each of these ACKs will cause a syn-cookie based connection to be
 	// accepted and delivered to the listening endpoint.
-	for i := uint16(0); i < n; i++ {
+	for i := 0; i < n; i++ {
 		b := c.GetPacket()
 		tcp := header.TCP(header.IPv4(b).Payload())
 		iss := seqnum.Value(tcp.SequenceNumber())
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index c5daba232..9438056f9 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2474,6 +2474,20 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
 // Listen puts the endpoint in "listen" mode, which allows it to accept
 // new connections.
 func (e *endpoint) Listen(backlog int) tcpip.Error {
+	if uint32(backlog) > MaxListenBacklog {
+		// Linux treats incoming backlog as uint with a limit defined by
+		// sysctl_somaxconn.
+		// https://github.com/torvalds/linux/blob/7acac4b3196/net/socket.c#L1666
+		//
+		// We use the backlog to allocate a channel of that size, hence enforce
+		// a hard limit for the backlog.
+		backlog = MaxListenBacklog
+	} else {
+		// Accept one more than the configured listen backlog to keep in parity with
+		// Linux. Ref, because of missing equality check here:
+		// https://github.com/torvalds/linux/blob/7acac4b3196/include/net/sock.h#L937
+		backlog++
+	}
 	err := e.listen(backlog)
 	if err != nil {
 		if !err.IgnoreStats() {
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 2a4667906..230fa6ebe 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -68,6 +68,9 @@ const (
 	// DefaultSynRetries is the default value for the number of SYN retransmits
 	// before a connect is aborted.
 	DefaultSynRetries = 6
+
+	// MaxListenBacklog is the maximum limit of listen backlog supported.
+	MaxListenBacklog = 1024
 )
 
 const (
@@ -75,63 +78,6 @@ const (
 	ccCubic = "cubic"
 )
 
-// syncRcvdCounter tracks the number of endpoints in the SYN-RCVD state. The
-// value is protected by a mutex so that we can increment only when it's
-// guaranteed not to go above a threshold.
-type synRcvdCounter struct {
-	sync.Mutex
-	value     uint64
-	pending   sync.WaitGroup
-	threshold uint64
-}
-
-// inc tries to increment the global number of endpoints in SYN-RCVD state. It
-// succeeds if the increment doesn't make the count go beyond the threshold, and
-// fails otherwise.
-func (s *synRcvdCounter) inc() bool {
-	s.Lock()
-	defer s.Unlock()
-	if s.value >= s.threshold {
-		return false
-	}
-
-	s.pending.Add(1)
-	s.value++
-
-	return true
-}
-
-// dec atomically decrements the global number of endpoints in SYN-RCVD
-// state. It must only be called if a previous call to inc succeeded.
-func (s *synRcvdCounter) dec() {
-	s.Lock()
-	defer s.Unlock()
-	s.value--
-	s.pending.Done()
-}
-
-// synCookiesInUse returns true if the synRcvdCount is greater than
-// SynRcvdCountThreshold.
-func (s *synRcvdCounter) synCookiesInUse() bool {
-	s.Lock()
-	defer s.Unlock()
-	return s.value >= s.threshold
-}
-
-// SetThreshold sets synRcvdCounter.Threshold to ths new threshold.
-func (s *synRcvdCounter) SetThreshold(threshold uint64) {
-	s.Lock()
-	defer s.Unlock()
-	s.threshold = threshold
-}
-
-// Threshold returns the current value of synRcvdCounter.Threhsold.
-func (s *synRcvdCounter) Threshold() uint64 {
-	s.Lock()
-	defer s.Unlock()
-	return s.threshold
-}
-
 type protocol struct {
 	stack *stack.Stack
 
@@ -139,6 +85,7 @@ type protocol struct {
 	sackEnabled                bool
 	recovery                   tcpip.TCPRecovery
 	delayEnabled               bool
+	alwaysUseSynCookies        bool
 	sendBufferSize             tcpip.TCPSendBufferSizeRangeOption
 	recvBufferSize             tcpip.TCPReceiveBufferSizeRangeOption
 	congestionControl          string
@@ -150,7 +97,6 @@ type protocol struct {
 	minRTO                     time.Duration
 	maxRTO                     time.Duration
 	maxRetries                 uint32
-	synRcvdCount               synRcvdCounter
 	synRetries                 uint8
 	dispatcher                 dispatcher
 }
@@ -373,9 +319,9 @@ func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) tcpip
 		p.mu.Unlock()
 		return nil
 
-	case *tcpip.TCPSynRcvdCountThresholdOption:
+	case *tcpip.TCPAlwaysUseSynCookies:
 		p.mu.Lock()
-		p.synRcvdCount.SetThreshold(uint64(*v))
+		p.alwaysUseSynCookies = bool(*v)
 		p.mu.Unlock()
 		return nil
 
@@ -480,9 +426,9 @@ func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) tcpip.Er
 		p.mu.RUnlock()
 		return nil
 
-	case *tcpip.TCPSynRcvdCountThresholdOption:
+	case *tcpip.TCPAlwaysUseSynCookies:
 		p.mu.RLock()
-		*v = tcpip.TCPSynRcvdCountThresholdOption(p.synRcvdCount.Threshold())
+		*v = tcpip.TCPAlwaysUseSynCookies(p.alwaysUseSynCookies)
 		p.mu.RUnlock()
 		return nil
 
@@ -507,12 +453,6 @@ func (p *protocol) Wait() {
 	p.dispatcher.wait()
 }
 
-// SynRcvdCounter returns a reference to the synRcvdCount for this protocol
-// instance.
-func (p *protocol) SynRcvdCounter() *synRcvdCounter {
-	return &p.synRcvdCount
-}
-
 // Parse implements stack.TransportProtocol.Parse.
 func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
 	return parse.TCP(pkt)
@@ -537,7 +477,6 @@ func NewProtocol(s *stack.Stack) stack.TransportProtocol {
 		lingerTimeout:              DefaultTCPLingerTimeout,
 		timeWaitTimeout:            DefaultTCPTimeWaitTimeout,
 		timeWaitReuse:              tcpip.TCPTimeWaitReuseLoopbackOnly,
-		synRcvdCount:               synRcvdCounter{threshold: SynRcvdCountThreshold},
 		synRetries:                 DefaultSynRetries,
 		minRTO:                     MinRTO,
 		maxRTO:                     MaxRTO,
diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go
index 81f800cad..20c9761f2 100644
--- a/pkg/tcpip/transport/tcp/tcp_sack_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go
@@ -160,12 +160,9 @@ func TestSackPermittedAccept(t *testing.T) {
 					defer c.Cleanup()
 
 					if tc.cookieEnabled {
-						// Set the SynRcvd threshold to
-						// zero to force a syn cookie
-						// based accept to happen.
-						var opt tcpip.TCPSynRcvdCountThresholdOption
+						opt := tcpip.TCPAlwaysUseSynCookies(true)
 						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
-							t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
+							t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
 						}
 					}
 					setStackSACKPermitted(t, c, sackEnabled)
@@ -235,12 +232,9 @@ func TestSackDisabledAccept(t *testing.T) {
 					defer c.Cleanup()
 
 					if tc.cookieEnabled {
-						// Set the SynRcvd threshold to
-						// zero to force a syn cookie
-						// based accept to happen.
-						var opt tcpip.TCPSynRcvdCountThresholdOption
+						opt := tcpip.TCPAlwaysUseSynCookies(true)
 						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
-							t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
+							t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
 						}
 					}
 
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 9c23469f2..5605a4390 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -955,11 +955,7 @@ func TestUserSuppliedMSSOnConnect(t *testing.T) {
 // when completing the handshake for a new TCP connection from a TCP
 // listening socket. It should be present in the sent TCP SYN-ACK segment.
 func TestUserSuppliedMSSOnListenAccept(t *testing.T) {
-	const (
-		nonSynCookieAccepts = 2
-		totalAccepts        = 4
-		mtu                 = 5000
-	)
+	const mtu = 5000
 
 	ips := []struct {
 		name     string
@@ -1033,12 +1029,6 @@ func TestUserSuppliedMSSOnListenAccept(t *testing.T) {
 
 					ip.createEP(c)
 
-					// Set the SynRcvd threshold to force a syn cookie based accept to happen.
-					opt := tcpip.TCPSynRcvdCountThresholdOption(nonSynCookieAccepts)
-					if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
-						t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
-					}
-
 					if err := c.EP.SetSockOptInt(tcpip.MaxSegOption, int(test.setMSS)); err != nil {
 						t.Fatalf("SetSockOptInt(MaxSegOption, %d): %s", test.setMSS, err)
 					}
@@ -1048,13 +1038,17 @@ func TestUserSuppliedMSSOnListenAccept(t *testing.T) {
 						t.Fatalf("Bind(%+v): %s:", bindAddr, err)
 					}
 
-					if err := c.EP.Listen(totalAccepts); err != nil {
-						t.Fatalf("Listen(%d): %s:", totalAccepts, err)
+					backlog := 5
+					// Keep the number of client requests twice to the backlog
+					// such that half of the connections do not use syncookies
+					// and the other half does.
+					clientConnects := backlog * 2
+
+					if err := c.EP.Listen(backlog); err != nil {
+						t.Fatalf("Listen(%d): %s:", backlog, err)
 					}
 
-					// The first nonSynCookieAccepts packets sent will trigger a gorooutine
-					// based accept. The rest will trigger a cookie based accept.
-					for i := 0; i < totalAccepts; i++ {
+					for i := 0; i < clientConnects; i++ {
 						// Send a SYN requests.
 						iss := seqnum.Value(i)
 						srcPort := context.TestPort + uint16(i)
@@ -3087,11 +3081,9 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) {
 	c := context.New(t, mtu)
 	defer c.Cleanup()
 
-	// Set the SynRcvd threshold to zero to force a syn cookie based accept
-	// to happen.
-	opt := tcpip.TCPSynRcvdCountThresholdOption(0)
+	opt := tcpip.TCPAlwaysUseSynCookies(true)
 	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
-		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
+		t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
 	}
 
 	// Create EP and start listening.
@@ -5363,7 +5355,7 @@ func TestListenBacklogFull(t *testing.T) {
 	}
 
 	lastPortOffset := uint16(0)
-	for ; int(lastPortOffset) < listenBacklog; lastPortOffset++ {
+	for ; int(lastPortOffset) < listenBacklog+1; lastPortOffset++ {
 		executeHandshake(t, c, context.TestPort+lastPortOffset, false /*synCookieInUse */)
 	}
 
@@ -5671,15 +5663,13 @@ func TestListenSynRcvdQueueFull(t *testing.T) {
 	}
 
 	// Test acceptance.
-	// Start listening.
-	listenBacklog := 1
-	if err := c.EP.Listen(listenBacklog); err != nil {
+	if err := c.EP.Listen(0); err != nil {
 		t.Fatalf("Listen failed: %s", err)
 	}
 
 	// Send two SYN's the first one should get a SYN-ACK, the
 	// second one should not get any response and is dropped as
-	// the synRcvd count will be equal to backlog.
+	// the accept queue is full.
 	irs := seqnum.Value(context.TestInitialSequenceNumber)
 	c.SendPacket(nil, &context.Headers{
 		SrcPort: context.TestPort,
@@ -5701,23 +5691,7 @@ func TestListenSynRcvdQueueFull(t *testing.T) {
 	}
 	checker.IPv4(t, b, checker.TCP(tcpCheckers...))
 
-	// Now execute send one more SYN. The stack should not respond as the backlog
-	// is full at this point.
-	//
-	// NOTE: we did not complete the handshake for the previous one so the
-	// accept backlog should be empty and there should be one connection in
-	// synRcvd state.
-	c.SendPacket(nil, &context.Headers{
-		SrcPort: context.TestPort + 1,
-		DstPort: context.StackPort,
-		Flags:   header.TCPFlagSyn,
-		SeqNum:  seqnum.Value(889),
-		RcvWnd:  30000,
-	})
-	c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond)
-
-	// Now complete the previous connection and verify that there is a connection
-	// to accept.
+	// Now complete the previous connection.
 	// Send ACK.
 	c.SendPacket(nil, &context.Headers{
 		SrcPort: context.TestPort,
@@ -5728,11 +5702,24 @@ func TestListenSynRcvdQueueFull(t *testing.T) {
 		RcvWnd:  30000,
 	})
 
-	// Try to accept the connections in the backlog.
+	// Verify if that is delivered to the accept queue.
 	we, ch := waiter.NewChannelEntry(nil)
 	c.WQ.EventRegister(&we, waiter.ReadableEvents)
 	defer c.WQ.EventUnregister(&we)
+	<-ch
+
+	// Now execute send one more SYN. The stack should not respond as the backlog
+	// is full at this point.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort + 1,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  seqnum.Value(889),
+		RcvWnd:  30000,
+	})
+	c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond)
 
+	// Try to accept the connections in the backlog.
 	newEP, _, err := c.EP.Accept(nil)
 	if _, ok := err.(*tcpip.ErrWouldBlock); ok {
 		// Wait for connection to be established.
@@ -5764,11 +5751,6 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
 
-	opt := tcpip.TCPSynRcvdCountThresholdOption(1)
-	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
-		t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
-	}
-
 	// Create TCP endpoint.
 	var err tcpip.Error
 	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
@@ -5781,9 +5763,8 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
 		t.Fatalf("Bind failed: %s", err)
 	}
 
-	// Start listening.
-	listenBacklog := 1
-	if err := c.EP.Listen(listenBacklog); err != nil {
+	// Test for SynCookies usage after filling up the backlog.
+	if err := c.EP.Listen(0); err != nil {
 		t.Fatalf("Listen failed: %s", err)
 	}
 
@@ -6066,7 +6047,7 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) {
 	if err := c.EP.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != nil {
 		t.Fatalf("Bind failed: %s", err)
 	}
-	if err := c.EP.Listen(1); err != nil {
+	if err := c.EP.Listen(0); err != nil {
 		t.Fatalf("Listen failed: %s", err)
 	}
 
diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
index 2949588ce..1deb1fe4d 100644
--- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
@@ -139,9 +139,9 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS
 	defer c.Cleanup()
 
 	if cookieEnabled {
-		var opt tcpip.TCPSynRcvdCountThresholdOption
+		opt := tcpip.TCPAlwaysUseSynCookies(true)
 		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
-			t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
+			t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
 		}
 	}
 
@@ -202,9 +202,9 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd
 	defer c.Cleanup()
 
 	if cookieEnabled {
-		var opt tcpip.TCPSynRcvdCountThresholdOption
+		opt := tcpip.TCPAlwaysUseSynCookies(true)
 		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
-			t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
+			t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
 		}
 	}
 
diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl
index 34e83ec49..634c15727 100644
--- a/test/packetimpact/runner/defs.bzl
+++ b/test/packetimpact/runner/defs.bzl
@@ -246,6 +246,12 @@ ALL_TESTS = [
         expect_netstack_failure = True,
     ),
     PacketimpactTestInfo(
+        name = "tcp_listen_backlog",
+    ),
+    PacketimpactTestInfo(
+        name = "tcp_syncookie",
+    ),
+    PacketimpactTestInfo(
         name = "icmpv6_param_problem",
     ),
     PacketimpactTestInfo(
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 92103c1e9..83ff70951 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -385,6 +385,26 @@ packetimpact_testbench(
     ],
 )
 
+packetimpact_testbench(
+    name = "tcp_listen_backlog",
+    srcs = ["tcp_listen_backlog_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_testbench(
+    name = "tcp_syncookie",
+    srcs = ["tcp_syncookie_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
 validate_all_tests()
 
 [packetimpact_go_test(
diff --git a/test/packetimpact/tests/tcp_listen_backlog_test.go b/test/packetimpact/tests/tcp_listen_backlog_test.go
new file mode 100644
index 000000000..26c812d0a
--- /dev/null
+++ b/test/packetimpact/tests/tcp_listen_backlog_test.go
@@ -0,0 +1,86 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_listen_backlog_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.Initialize(flag.CommandLine)
+}
+
+// TestTCPListenBacklog tests for a listening endpoint behavior:
+// (1) reply to more SYNs than what is configured as listen backlog
+// (2) ignore ACKs (that complete a handshake) when the accept queue is full
+// (3) ignore incoming SYNs when the accept queue is full
+func TestTCPListenBacklog(t *testing.T) {
+	dut := testbench.NewDUT(t)
+
+	// Listening endpoint accepts one more connection than the listen backlog.
+	listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 0 /*backlog*/)
+
+	var establishedConn testbench.TCPIPv4
+	var incompleteConn testbench.TCPIPv4
+
+	// Test if the DUT listener replies to more SYNs than listen backlog+1
+	for i, conn := range []*testbench.TCPIPv4{&establishedConn, &incompleteConn} {
+		*conn = dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+		// Expect dut connection to have transitioned to SYN-RCVD state.
+		conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)})
+		if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil {
+			t.Fatalf("expected SYN-ACK for %d connection, %s", i, err)
+		}
+	}
+	defer establishedConn.Close(t)
+	defer incompleteConn.Close(t)
+
+	// Send the ACK to complete handshake.
+	establishedConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)})
+	dut.PollOne(t, listenFd, unix.POLLIN, time.Second)
+
+	// Send the ACK to complete handshake, expect this to be ignored by the
+	// listener.
+	incompleteConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)})
+
+	// Drain the accept queue to enable poll for subsequent connections on the
+	// listener.
+	dut.Accept(t, listenFd)
+
+	// The ACK for the incomplete connection should be ignored by the
+	// listening endpoint and the poll on listener should now time out.
+	if pfds := dut.Poll(t, []unix.PollFd{{Fd: listenFd, Events: unix.POLLIN}}, time.Second); len(pfds) != 0 {
+		t.Fatalf("got dut.Poll(...) = %#v", pfds)
+	}
+
+	// Re-send the ACK to complete handshake and re-fill the accept-queue.
+	incompleteConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)})
+	dut.PollOne(t, listenFd, unix.POLLIN, time.Second)
+
+	// Now initiate a new connection when the accept queue is full.
+	connectingConn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer connectingConn.Close(t)
+	// Expect dut connection to drop the SYN and let the client stay in SYN_SENT state.
+	connectingConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)})
+	if got, err := connectingConn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err == nil {
+		t.Fatalf("expected no SYN-ACK, but got %s", got)
+	}
+}
diff --git a/test/packetimpact/tests/tcp_syncookie_test.go b/test/packetimpact/tests/tcp_syncookie_test.go
new file mode 100644
index 000000000..1c21c62ff
--- /dev/null
+++ b/test/packetimpact/tests/tcp_syncookie_test.go
@@ -0,0 +1,70 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_syncookie_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.Initialize(flag.CommandLine)
+}
+
+// TestSynCookie test if the DUT listener is replying back using syn cookies.
+// The test does not complete the handshake by not sending the ACK to SYNACK.
+// When syncookies are not used, this forces the listener to retransmit SYNACK.
+// And when syncookies are being used, there is no such retransmit.
+func TestTCPSynCookie(t *testing.T) {
+	dut := testbench.NewDUT(t)
+
+	// Listening endpoint accepts one more connection than the listen backlog.
+	_, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/)
+
+	var withoutSynCookieConn testbench.TCPIPv4
+	var withSynCookieConn testbench.TCPIPv4
+
+	// Test if the DUT listener replies to more SYNs than listen backlog+1
+	for _, conn := range []*testbench.TCPIPv4{&withoutSynCookieConn, &withSynCookieConn} {
+		*conn = dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	}
+	defer withoutSynCookieConn.Close(t)
+	defer withSynCookieConn.Close(t)
+
+	checkSynAck := func(t *testing.T, conn *testbench.TCPIPv4, expectRetransmit bool) {
+		// Expect dut connection to have transitioned to SYN-RCVD state.
+		conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)})
+		if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil {
+			t.Fatalf("expected SYN-ACK, but got %s", err)
+		}
+
+		// If the DUT listener is using syn cookies, it will not retransmit SYNACK
+		got, err := conn.ExpectData(t, &testbench.TCP{SeqNum: testbench.Uint32(uint32(*conn.RemoteSeqNum(t) - 1)), Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, 2*time.Second)
+		if expectRetransmit && err != nil {
+			t.Fatalf("expected retransmitted SYN-ACK, but got %s", err)
+		}
+		if !expectRetransmit && err == nil {
+			t.Fatalf("expected no retransmitted SYN-ACK, but got %s", got)
+		}
+	}
+
+	t.Run("without syncookies", func(t *testing.T) { checkSynAck(t, &withoutSynCookieConn, true /*expectRetransmit*/) })
+	t.Run("with syncookies", func(t *testing.T) { checkSynAck(t, &withSynCookieConn, false /*expectRetransmit*/) })
+}
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 597b5bcb1..d391363fb 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -489,13 +489,6 @@ void TestListenWhileConnect(const TestParam& param,
   TestAddress const& listener = param.listener;
   TestAddress const& connector = param.connector;
 
-  constexpr int kBacklog = 2;
-  // Linux completes one more connection than the listen backlog argument.
-  // To ensure that there is at least one client connection that stays in
-  // connecting state, keep 2 more client connections than the listen backlog.
-  // gVisor differs in this behavior though, gvisor.dev/issue/3153.
-  constexpr int kClients = kBacklog + 2;
-
   // Create the listening socket.
   FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
       Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
@@ -503,6 +496,13 @@ void TestListenWhileConnect(const TestParam& param,
   ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
                    listener.addr_len),
               SyscallSucceeds());
+  // This test is only interested in deterministically getting a socket in
+  // connecting state. For that, we use a listen backlog of zero which would
+  // mean there is exactly one connection that gets established and is enqueued
+  // to the accept queue. We poll on the listener to ensure that is enqueued.
+  // After that the subsequent client connect will stay in connecting state as
+  // the accept queue is full.
+  constexpr int kBacklog = 0;
   ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
 
   // Get the port bound by the listening socket.
@@ -515,42 +515,49 @@ void TestListenWhileConnect(const TestParam& param,
 
   sockaddr_storage conn_addr = connector.addr;
   ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
-  std::vector<FileDescriptor> clients;
-  for (int i = 0; i < kClients; i++) {
-    FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE(
-        Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
-    int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
-                      connector.addr_len);
-    if (ret != 0) {
-      EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
-      clients.push_back(std::move(client));
-    }
+  FileDescriptor established_client = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+  ASSERT_THAT(
+      connect(established_client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+              connector.addr_len),
+      SyscallSucceeds());
+
+  // Ensure that the accept queue has the completed connection.
+  constexpr int kTimeout = 10000;
+  pollfd pfd = {
+      .fd = listen_fd.get(),
+      .events = POLLIN,
+  };
+  ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+  ASSERT_EQ(pfd.revents, POLLIN);
+
+  FileDescriptor connecting_client = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+  // Keep the last client in connecting state.
+  int ret =
+      connect(connecting_client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+              connector.addr_len);
+  if (ret != 0) {
+    EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
   }
 
   stopListen(listen_fd);
 
-  for (auto& client : clients) {
-    constexpr int kTimeout = 10000;
+  std::array<std::pair<int, int>, 2> sockets = {
+      std::make_pair(established_client.get(), ECONNRESET),
+      std::make_pair(connecting_client.get(), ECONNREFUSED),
+  };
+  for (size_t i = 0; i < sockets.size(); i++) {
+    SCOPED_TRACE(absl::StrCat("i=", i));
+    auto [fd, expected_errno] = sockets[i];
     pollfd pfd = {
-        .fd = client.get(),
-        .events = POLLIN,
+        .fd = fd,
     };
-    // When the listening socket is closed, then we expect the remote to reset
-    // the connection.
-    ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
-    ASSERT_EQ(pfd.revents, POLLIN | POLLHUP | POLLERR);
+    // When the listening socket is closed, the peer would reset the connection.
+    EXPECT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+    EXPECT_EQ(pfd.revents, POLLHUP | POLLERR);
     char c;
-    // Subsequent read can fail with:
-    // ECONNRESET: If the client connection was established and was reset by the
-    // remote.
-    // ECONNREFUSED: If the client connection failed to be established.
-    ASSERT_THAT(read(client.get(), &c, sizeof(c)),
-                AnyOf(SyscallFailsWithErrno(ECONNRESET),
-                      SyscallFailsWithErrno(ECONNREFUSED)));
-    // The last client connection would be in connecting (SYN_SENT) state.
-    if (client.get() == clients[kClients - 1].get()) {
-      ASSERT_EQ(errno, ECONNREFUSED) << strerror(errno);
-    }
+    EXPECT_THAT(read(fd, &c, sizeof(c)), SyscallFailsWithErrno(expected_errno));
   }
 }
 
@@ -570,7 +577,59 @@ TEST_P(SocketInetLoopbackTest, TCPListenShutdownWhileConnect) {
 // random save as established connections which can't be delivered to the accept
 // queue because the queue is full are not correctly delivered after restore
 // causing the last accept to timeout on the restore.
-TEST_P(SocketInetLoopbackTest, TCPbacklog_NoRandomSave) {
+TEST_P(SocketInetLoopbackTest, TCPAcceptBacklogSizes_NoRandomSave) {
+  auto const& param = GetParam();
+
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  // Create the listening socket.
+  const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+  std::array<int, 3> backlogs = {-1, 0, 1};
+  for (auto& backlog : backlogs) {
+    ASSERT_THAT(listen(listen_fd.get(), backlog), SyscallSucceeds());
+
+    int expected_accepts;
+    if (backlog < 0) {
+      expected_accepts = 1024;
+    } else {
+      expected_accepts = backlog + 1;
+    }
+    for (int i = 0; i < expected_accepts; i++) {
+      SCOPED_TRACE(absl::StrCat("i=", i));
+      // Connect to the listening socket.
+      const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+          Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+      sockaddr_storage conn_addr = connector.addr;
+      ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+      ASSERT_THAT(
+          RetryEINTR(connect)(conn_fd.get(),
+                              reinterpret_cast<struct sockaddr*>(&conn_addr),
+                              connector.addr_len),
+          SyscallSucceeds());
+      const FileDescriptor accepted =
+          ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+    }
+  }
+}
+
+// TODO(b/157236388): Remove _NoRandomSave once bug is fixed. Test fails w/
+// random save as established connections which can't be delivered to the accept
+// queue because the queue is full are not correctly delivered after restore
+// causing the last accept to timeout on the restore.
+TEST_P(SocketInetLoopbackTest, TCPBacklog_NoRandomSave) {
   auto const& param = GetParam();
 
   TestAddress const& listener = param.listener;
@@ -595,6 +654,7 @@ TEST_P(SocketInetLoopbackTest, TCPbacklog_NoRandomSave) {
       ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
   int i = 0;
   while (1) {
+    SCOPED_TRACE(absl::StrCat("i=", i));
     int ret;
 
     // Connect to the listening socket.
@@ -620,103 +680,133 @@ TEST_P(SocketInetLoopbackTest, TCPbacklog_NoRandomSave) {
     i++;
   }
 
+  int client_conns = i;
+  int accepted_conns = 0;
   for (; i != 0; i--) {
-    // Accept the connection.
-    //
-    // We have to assign a name to the accepted socket, as unamed temporary
-    // objects are destructed upon full evaluation of the expression it is in,
-    // potentially causing the connecting socket to fail to shutdown properly.
-    auto accepted =
-        ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+    SCOPED_TRACE(absl::StrCat("i=", i));
+    pollfd pfd = {
+        .fd = listen_fd.get(),
+        .events = POLLIN,
+    };
+    // Look for incoming connections to accept. The last connect request could
+    // be established from the client side, but the ACK of the handshake could
+    // be dropped by the listener if the accept queue was filled up by the
+    // previous connect.
+    int ret;
+    ASSERT_THAT(ret = poll(&pfd, 1, 3000), SyscallSucceeds());
+    if (ret == 0) break;
+    if (pfd.revents == POLLIN) {
+      // Accept the connection.
+      //
+      // We have to assign a name to the accepted socket, as unamed temporary
+      // objects are destructed upon full evaluation of the expression it is in,
+      // potentially causing the connecting socket to fail to shutdown properly.
+      auto accepted =
+          ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+      accepted_conns++;
+    }
   }
+  // We should accept at least listen backlog + 1 connections. As the stack is
+  // enqueuing established connections to the accept queue, newer SYNs could
+  // still be replied to causing those client connections would be accepted as
+  // we start dequeuing the queue.
+  ASSERT_GE(accepted_conns, kBacklogSize + 1);
+  ASSERT_GE(client_conns, accepted_conns);
 }
 
-// Test if the stack completes atmost listen backlog number of client
-// connections. It exercises the path of the stack that enqueues completed
-// connections to accept queue vs new incoming SYNs.
-TEST_P(SocketInetLoopbackTest, TCPConnectBacklog_NoRandomSave) {
-  const auto& param = GetParam();
-  const TestAddress& listener = param.listener;
-  const TestAddress& connector = param.connector;
+// TODO(b/157236388): Remove _NoRandomSave once bug is fixed. Test fails w/
+// random save as established connections which can't be delivered to the accept
+// queue because the queue is full are not correctly delivered after restore
+// causing the last accept to timeout on the restore.
+TEST_P(SocketInetLoopbackTest, TCPBacklogAcceptAll_NoRandomSave) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
 
+  // Create the listening socket.
+  FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
   constexpr int kBacklog = 1;
-  // Keep the number of client connections more than the listen backlog.
-  // Linux completes one more connection than the listen backlog argument.
-  // gVisor differs in this behavior though, gvisor.dev/issue/3153.
-  int kClients = kBacklog + 2;
-  if (IsRunningOnGvisor()) {
-    kClients--;
-  }
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
 
-  // Run the following test for few iterations to test race between accept queue
-  // getting filled with incoming SYNs.
-  for (int num = 0; num < 10; num++) {
-    FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
-        Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
-    sockaddr_storage listen_addr = listener.addr;
-    ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
-                     listener.addr_len),
-                SyscallSucceeds());
-    ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
 
-    socklen_t addrlen = listener.addr_len;
-    ASSERT_THAT(
-        getsockname(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
-                    &addrlen),
-        SyscallSucceeds());
-    uint16_t const port =
-        ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
-    sockaddr_storage conn_addr = connector.addr;
-    ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
 
-    std::vector<FileDescriptor> clients;
-    // Issue multiple non-blocking client connects.
-    for (int i = 0; i < kClients; i++) {
-      FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE(
-          Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
-      int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
-                        connector.addr_len);
-      if (ret != 0) {
-        EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
-      }
-      clients.push_back(std::move(client));
+  // Fill up the accept queue and trigger more client connections which would be
+  // waiting to be accepted.
+  std::array<FileDescriptor, kBacklog + 1> established_clients;
+  for (auto& fd : established_clients) {
+    fd = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    ASSERT_THAT(connect(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                        connector.addr_len),
+                SyscallSucceeds());
+  }
+  std::array<FileDescriptor, kBacklog> waiting_clients;
+  for (auto& fd : waiting_clients) {
+    fd = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+    int ret = connect(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                      connector.addr_len);
+    if (ret != 0) {
+      EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
     }
+  }
 
-    // Now that client connects are issued, wait for the accept queue to get
-    // filled and ensure no new client connection is completed.
-    for (int i = 0; i < kClients; i++) {
-      pollfd pfd = {
-          .fd = clients[i].get(),
-          .events = POLLOUT,
-      };
-      if (i < kClients - 1) {
-        // Poll for client side connection completions with a large timeout.
-        // We cannot poll on the listener side without calling accept as poll
-        // stays level triggered with non-zero accept queue length.
-        //
-        // Client side poll would not guarantee that the completed connection
-        // has been enqueued in to the acccept queue, but the fact that the
-        // listener ACKd the SYN, means that it cannot complete any new incoming
-        // SYNs when it has already ACKd for > backlog number of SYNs.
-        ASSERT_THAT(poll(&pfd, 1, 10000), SyscallSucceedsWithValue(1))
-            << "num=" << num << " i=" << i << " kClients=" << kClients;
-        ASSERT_EQ(pfd.revents, POLLOUT) << "num=" << num << " i=" << i;
-      } else {
-        // Now that we expect accept queue filled up, ensure that the last
-        // client connection never completes with a smaller poll timeout.
-        ASSERT_THAT(poll(&pfd, 1, 1000), SyscallSucceedsWithValue(0))
-            << "num=" << num << " i=" << i;
-      }
+  auto accept_connection = [&]() {
+    constexpr int kTimeout = 10000;
+    pollfd pfd = {
+        .fd = listen_fd.get(),
+        .events = POLLIN,
+    };
+    ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+    ASSERT_EQ(pfd.revents, POLLIN);
+    // Accept the connection.
+    //
+    // We have to assign a name to the accepted socket, as unamed temporary
+    // objects are destructed upon full evaluation of the expression it is in,
+    // potentially causing the connecting socket to fail to shutdown properly.
+    auto accepted =
+        ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+  };
 
-      ASSERT_THAT(close(clients[i].release()), SyscallSucceedsWithValue(0))
-          << "num=" << num << " i=" << i;
-    }
-    clients.clear();
-    // We close the listening side and open a new listener. We could instead
-    // drain the accept queue by calling accept() and reuse the listener, but
-    // that is racy as the retransmitted SYNs could get ACKd as we make room in
-    // the accept queue.
-    ASSERT_THAT(close(listen_fd.release()), SyscallSucceedsWithValue(0));
+  // Ensure that we accept all client connections. The waiting connections would
+  // get enqueued as we drain the accept queue.
+  for (int i = 0; i < std::size(established_clients); i++) {
+    SCOPED_TRACE(absl::StrCat("established clients i=", i));
+    accept_connection();
+  }
+
+  // The waiting client connections could be in one of these 2 states:
+  // (1) SYN_SENT: if the SYN was dropped because accept queue was full
+  // (2) ESTABLISHED: if the listener sent back a SYNACK, but may have dropped
+  // the ACK from the client if the accept queue was full (send out a data to
+  // re-send that ACK, to address that case).
+  for (int i = 0; i < std::size(waiting_clients); i++) {
+    SCOPED_TRACE(absl::StrCat("waiting clients i=", i));
+    constexpr int kTimeout = 10000;
+    pollfd pfd = {
+        .fd = waiting_clients[i].get(),
+        .events = POLLOUT,
+    };
+    EXPECT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+    EXPECT_EQ(pfd.revents, POLLOUT);
+    char c;
+    EXPECT_THAT(RetryEINTR(send)(waiting_clients[i].get(), &c, sizeof(c), 0),
+                SyscallSucceedsWithValue(sizeof(c)));
+    accept_connection();
   }
 }
author	Mithun Iyer <iyerm@google.com>	2021-04-05 21:51:31 -0700
committer	gVisor bot <gvisor-bot@google.com>	2021-04-05 21:53:41 -0700
commit	56c69fb0e7f96e5bd5da9e0d29a78e05dd3e2bba (patch)
tree	c231730c05bf555753b19577edf05e7235a9569e
parent	7a7fcf2dbaa7bdcdb9b523358de91c71d5cb05d8 (diff)