-rw-r--r-- | pkg/sentry/syscalls/linux/sys_socket.go | 14
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/socket.go | 14
-rw-r--r-- | pkg/tcpip/tcpip.go | 20
-rw-r--r-- | pkg/tcpip/transport/tcp/accept.go | 91
-rw-r--r-- | pkg/tcpip/transport/tcp/connect.go | 20
-rw-r--r-- | pkg/tcpip/transport/tcp/dual_stack_test.go | 14
-rw-r--r-- | pkg/tcpip/transport/tcp/endpoint.go | 14
-rw-r--r-- | pkg/tcpip/transport/tcp/protocol.go | 77
-rw-r--r-- | pkg/tcpip/transport/tcp/tcp_sack_test.go | 14
-rw-r--r-- | pkg/tcpip/transport/tcp/tcp_test.go | 85
-rw-r--r-- | pkg/tcpip/transport/tcp/tcp_timestamp_test.go | 8
-rw-r--r-- | test/packetimpact/runner/defs.bzl | 6
-rw-r--r-- | test/packetimpact/tests/BUILD | 20
-rw-r--r-- | test/packetimpact/tests/tcp_listen_backlog_test.go | 86
-rw-r--r-- | test/packetimpact/tests/tcp_syncookie_test.go | 70
-rw-r--r-- | test/syscalls/linux/socket_inet_loopback.cc | 334
16 files changed, 527 insertions, 360 deletions
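
This change drops the stack-wide SYN-RCVD threshold (TCPSynRcvdCountThresholdOption) in favor of a per-listener check against the listen backlog plus an explicit TCPAlwaysUseSynCookies stack option. As a point of reference, the sketch below shows how a netstack user would force SYN cookies once this change is in; the stack construction is illustrative (the updated tests get an equivalent stack from their test context helper), and only the option type and the SetTransportProtocolOption call are taken from the change itself.

package main

import (
	"log"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
)

func main() {
	// Build a stack with IPv4 and TCP; illustrative setup only.
	s := stack.New(stack.Options{
		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
		TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol},
	})

	// Force SYN cookies for every passive connection. This replaces the old
	// idiom of setting TCPSynRcvdCountThresholdOption to zero.
	opt := tcpip.TCPAlwaysUseSynCookies(true)
	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
		log.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
	}
}
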
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 9bdf6d3d8..0141e8a96 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -35,12 +35,6 @@ import ( // LINT.IfChange -// minListenBacklog is the minimum reasonable backlog for listening sockets. -const minListenBacklog = 8 - -// maxListenBacklog is the maximum allowed backlog for listening sockets. -const maxListenBacklog = 1024 - // maxAddrLen is the maximum socket address length we're willing to accept. const maxAddrLen = 200 @@ -382,14 +376,6 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, syserror.ENOTSOCK } - // Per Linux, the backlog is silently capped to reasonable values. - if backlog <= 0 { - backlog = minListenBacklog - } - if backlog > maxListenBacklog { - backlog = maxListenBacklog - } - return 0, nil, s.Listen(t, int(backlog)).ToError() } diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go index a87a66146..7cc0be892 100644 --- a/pkg/sentry/syscalls/linux/vfs2/socket.go +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -35,12 +35,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" ) -// minListenBacklog is the minimum reasonable backlog for listening sockets. -const minListenBacklog = 8 - -// maxListenBacklog is the maximum allowed backlog for listening sockets. -const maxListenBacklog = 1024 - // maxAddrLen is the maximum socket address length we're willing to accept. const maxAddrLen = 200 @@ -386,14 +380,6 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, syserror.ENOTSOCK } - // Per Linux, the backlog is silently capped to reasonable values. - if backlog <= 0 { - backlog = minListenBacklog - } - if backlog > maxListenBacklog { - backlog = maxListenBacklog - } - return 0, nil, s.Listen(t, int(backlog)).ToError() } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 87ea09a5e..60de16579 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -786,6 +786,13 @@ func (*TCPRecovery) isGettableTransportProtocolOption() {} func (*TCPRecovery) isSettableTransportProtocolOption() {} +// TCPAlwaysUseSynCookies indicates unconditional usage of syncookies. +type TCPAlwaysUseSynCookies bool + +func (*TCPAlwaysUseSynCookies) isGettableTransportProtocolOption() {} + +func (*TCPAlwaysUseSynCookies) isSettableTransportProtocolOption() {} + const ( // TCPRACKLossDetection indicates RACK is used for loss detection and // recovery. @@ -1020,19 +1027,6 @@ func (*TCPMaxRetriesOption) isGettableTransportProtocolOption() {} func (*TCPMaxRetriesOption) isSettableTransportProtocolOption() {} -// TCPSynRcvdCountThresholdOption is used by SetSockOpt/GetSockOpt to specify -// the number of endpoints that can be in SYN-RCVD state before the stack -// switches to using SYN cookies. -type TCPSynRcvdCountThresholdOption uint64 - -func (*TCPSynRcvdCountThresholdOption) isGettableSocketOption() {} - -func (*TCPSynRcvdCountThresholdOption) isSettableSocketOption() {} - -func (*TCPSynRcvdCountThresholdOption) isGettableTransportProtocolOption() {} - -func (*TCPSynRcvdCountThresholdOption) isSettableTransportProtocolOption() {} - // TCPSynRetriesOption is used by SetSockOpt/GetSockOpt to specify stack-wide // default for number of times SYN is retransmitted before aborting a connect. 
type TCPSynRetriesOption uint8 diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 025b134e2..7372ebc08 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -51,11 +51,6 @@ const ( // timestamp and the current timestamp. If the difference is greater // than maxTSDiff, the cookie is expired. maxTSDiff = 2 - - // SynRcvdCountThreshold is the default global maximum number of - // connections that are allowed to be in SYN-RCVD state before TCP - // starts using SYN cookies to accept connections. - SynRcvdCountThreshold uint64 = 1000 ) var ( @@ -80,9 +75,6 @@ func encodeMSS(mss uint16) uint32 { type listenContext struct { stack *stack.Stack - // synRcvdCount is a reference to the stack level synRcvdCount. - synRcvdCount *synRcvdCounter - // rcvWnd is the receive window that is sent by this listening context // in the initial SYN-ACK. rcvWnd seqnum.Size @@ -138,11 +130,6 @@ func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, listenEP: listenEP, pendingEndpoints: make(map[stack.TransportEndpointID]*endpoint), } - p, ok := stk.TransportProtocolInstance(ProtocolNumber).(*protocol) - if !ok { - panic(fmt.Sprintf("unable to get TCP protocol instance from stack: %+v", stk)) - } - l.synRcvdCount = p.SynRcvdCounter() rand.Read(l.nonce[0][:]) rand.Read(l.nonce[1][:]) @@ -199,6 +186,14 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu return (v - l.cookieHash(id, cookieTS, 1)) & hashMask, true } +func (l *listenContext) useSynCookies() bool { + var alwaysUseSynCookies tcpip.TCPAlwaysUseSynCookies + if err := l.stack.TransportProtocolOption(header.TCPProtocolNumber, &alwaysUseSynCookies); err != nil { + panic(fmt.Sprintf("TransportProtocolOption(%d, %T) = %s", header.TCPProtocolNumber, alwaysUseSynCookies, err)) + } + return bool(alwaysUseSynCookies) || (l.listenEP != nil && l.listenEP.synRcvdBacklogFull()) +} + // createConnectingEndpoint creates a new endpoint in a connecting state, with // the connection parameters given by the arguments. func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, tcpip.Error) { @@ -307,6 +302,7 @@ func (l *listenContext) startHandshake(s *segment, opts *header.TCPSynOptions, q // Initialize and start the handshake. h := ep.newPassiveHandshake(isn, irs, opts, deferAccept) + h.listenEP = l.listenEP h.start() return h, nil } @@ -485,7 +481,6 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header } go func() { - defer ctx.synRcvdCount.dec() if err := h.complete(); err != nil { e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() @@ -497,24 +492,29 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header h.ep.startAcceptedLoop() e.stack.Stats().TCP.PassiveConnectionOpenings.Increment() e.deliverAccepted(h.ep, false /*withSynCookie*/) - }() // S/R-SAFE: synRcvdCount is the barrier. + }() return nil } -func (e *endpoint) incSynRcvdCount() bool { +func (e *endpoint) synRcvdBacklogFull() bool { e.acceptMu.Lock() - canInc := int(atomic.LoadInt32(&e.synRcvdCount)) < cap(e.acceptedChan) + acceptedChanCap := cap(e.acceptedChan) e.acceptMu.Unlock() - if canInc { - atomic.AddInt32(&e.synRcvdCount, 1) - } - return canInc + // The allocated accepted channel size would always be one greater than the + // listen backlog. 
But, the SYNRCVD connections count is always checked + // against the listen backlog value for Linux parity reason. + // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/inet_connection_sock.h#L280 + // + // We maintain an equality check here as the synRcvdCount is incremented + // and compared only from a single listener context and the capacity of + // the accepted channel can only increase by a new listen call. + return int(atomic.LoadInt32(&e.synRcvdCount)) == acceptedChanCap-1 } func (e *endpoint) acceptQueueIsFull() bool { e.acceptMu.Lock() - full := len(e.acceptedChan)+int(atomic.LoadInt32(&e.synRcvdCount)) >= cap(e.acceptedChan) + full := len(e.acceptedChan) == cap(e.acceptedChan) e.acceptMu.Unlock() return full } @@ -539,17 +539,13 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err switch { case s.flags == header.TCPFlagSyn: opts := parseSynSegmentOptions(s) - if ctx.synRcvdCount.inc() { - // Only handle the syn if the following conditions hold - // - accept queue is not full. - // - number of connections in synRcvd state is less than the - // backlog. - if !e.acceptQueueIsFull() && e.incSynRcvdCount() { + if !ctx.useSynCookies() { + if !e.acceptQueueIsFull() { s.incRef() + atomic.AddInt32(&e.synRcvdCount, 1) _ = e.handleSynSegment(ctx, s, &opts) return nil } - ctx.synRcvdCount.dec() e.stack.Stats().TCP.ListenOverflowSynDrop.Increment() e.stats.ReceiveErrors.ListenOverflowSynDrop.Increment() e.stack.Stats().DroppedPackets.Increment() @@ -615,25 +611,6 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err return nil } - if !ctx.synRcvdCount.synCookiesInUse() { - // When not using SYN cookies, as per RFC 793, section 3.9, page 64: - // Any acknowledgment is bad if it arrives on a connection still in - // the LISTEN state. An acceptable reset segment should be formed - // for any arriving ACK-bearing segment. The RST should be - // formatted as follows: - // - // <SEQ=SEG.ACK><CTL=RST> - // - // Send a reset as this is an ACK for which there is no - // half open connections and we are not using cookies - // yet. - // - // The only time we should reach here when a connection - // was opened and closed really quickly and a delayed - // ACK was received from the sender. - return replyWithReset(e.stack, s, e.sendTOS, e.ttl) - } - iss := s.ackNumber - 1 irs := s.sequenceNumber - 1 @@ -651,7 +628,23 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err if !ok || int(data) >= len(mssTable) { e.stack.Stats().TCP.ListenOverflowInvalidSynCookieRcvd.Increment() e.stack.Stats().DroppedPackets.Increment() - return nil + + // When not using SYN cookies, as per RFC 793, section 3.9, page 64: + // Any acknowledgment is bad if it arrives on a connection still in + // the LISTEN state. An acceptable reset segment should be formed + // for any arriving ACK-bearing segment. The RST should be + // formatted as follows: + // + // <SEQ=SEG.ACK><CTL=RST> + // + // Send a reset as this is an ACK for which there is no + // half open connections and we are not using cookies + // yet. + // + // The only time we should reach here when a connection + // was opened and closed really quickly and a delayed + // ACK was received from the sender. + return replyWithReset(e.stack, s, e.sendTOS, e.ttl) } e.stack.Stats().TCP.ListenOverflowSynCookieRcvd.Increment() // Create newly accepted endpoint and deliver it. 
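
The listener-side bookkeeping in accept.go can be summarized with a toy model (illustrative Go, not the endpoint code): Listen(backlog) allocates an accept queue of capacity backlog+1, the SYN-RCVD count is compared against the backlog itself for Linux parity, and SYN cookies are used either when the stack option is set or when that per-listener SYN-RCVD limit is reached.

package tcpbacklogmodel

// listenerModel is a toy model of the listener-side state above; the real
// endpoint keeps an acceptedChan and an atomic synRcvdCount guarded by
// acceptMu. It assumes Listen(backlog) allocated an accept queue of
// capacity backlog+1, as endpoint.Listen does later in this change.
type listenerModel struct {
	synRcvd  int // handshakes started but not yet completed
	accepted int // completed connections waiting to be Accept()ed
	queueCap int // listen backlog + 1
}

// synRcvdBacklogFull mirrors endpoint.synRcvdBacklogFull: the SYN-RCVD count
// is compared against the backlog itself (queueCap-1), and plain equality
// suffices because the count only grows from one listener context and the
// capacity only grows on a later Listen call.
func (m listenerModel) synRcvdBacklogFull() bool { return m.synRcvd == m.queueCap-1 }

// acceptQueueIsFull mirrors endpoint.acceptQueueIsFull.
func (m listenerModel) acceptQueueIsFull() bool { return m.accepted == m.queueCap }

// useSynCookies mirrors listenContext.useSynCookies for a non-nil listener:
// cookies are used when the stack-wide option is set or when this listener
// already has a backlog's worth of connections in SYN-RCVD.
func (m listenerModel) useSynCookies(alwaysUseSynCookies bool) bool {
	return alwaysUseSynCookies || m.synRcvdBacklogFull()
}
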
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index a9e978cf6..8f0f0c3e9 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -65,11 +65,12 @@ const ( // NOTE: handshake.ep.mu is held during handshake processing. It is released if // we are going to block and reacquired when we start processing an event. type handshake struct { - ep *endpoint - state handshakeState - active bool - flags header.TCPFlags - ackNum seqnum.Value + ep *endpoint + listenEP *endpoint + state handshakeState + active bool + flags header.TCPFlags + ackNum seqnum.Value // iss is the initial send sequence number, as defined in RFC 793. iss seqnum.Value @@ -394,6 +395,15 @@ func (h *handshake) synRcvdState(s *segment) tcpip.Error { return nil } + // Drop the ACK if the accept queue is full. + // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_ipv4.c#L1523 + // We could abort the connection as well with a tunable as in + // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_minisocks.c#L788 + if listenEP := h.listenEP; listenEP != nil && listenEP.acceptQueueIsFull() { + listenEP.stack.Stats().DroppedPackets.Increment() + return nil + } + // Update timestamp if required. See RFC7323, section-4.3. if h.ep.sendTSOk && s.parsedOptions.TS { h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber) diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go index f6a16f96e..d6d68f128 100644 --- a/pkg/tcpip/transport/tcp/dual_stack_test.go +++ b/pkg/tcpip/transport/tcp/dual_stack_test.go @@ -565,17 +565,15 @@ func TestV4AcceptOnV4(t *testing.T) { } func testV4ListenClose(t *testing.T, c *context.Context) { - // Set the SynRcvd threshold to zero to force a syn cookie based accept - // to happen. - var opt tcpip.TCPSynRcvdCountThresholdOption + opt := tcpip.TCPAlwaysUseSynCookies(true) if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - t.Fatalf("setting TCPSynRcvdCountThresholdOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) + t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) } - const n = uint16(32) + const n = 32 // Start listening. - if err := c.EP.Listen(int(tcp.SynRcvdCountThreshold + 1)); err != nil { + if err := c.EP.Listen(n); err != nil { t.Fatalf("Listen failed: %v", err) } @@ -591,9 +589,9 @@ func testV4ListenClose(t *testing.T, c *context.Context) { }) } - // Each of these ACK's will cause a syn-cookie based connection to be + // Each of these ACKs will cause a syn-cookie based connection to be // accepted and delivered to the listening endpoint. - for i := uint16(0); i < n; i++ { + for i := 0; i < n; i++ { b := c.GetPacket() tcp := header.TCP(header.IPv4(b).Payload()) iss := seqnum.Value(tcp.SequenceNumber()) diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index c5daba232..9438056f9 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -2474,6 +2474,20 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error { // Listen puts the endpoint in "listen" mode, which allows it to accept // new connections. func (e *endpoint) Listen(backlog int) tcpip.Error { + if uint32(backlog) > MaxListenBacklog { + // Linux treats incoming backlog as uint with a limit defined by + // sysctl_somaxconn. 
+ // https://github.com/torvalds/linux/blob/7acac4b3196/net/socket.c#L1666 + // + // We use the backlog to allocate a channel of that size, hence enforce + // a hard limit for the backlog. + backlog = MaxListenBacklog + } else { + // Accept one more than the configured listen backlog to keep in parity with + // Linux. Ref, because of missing equality check here: + // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/sock.h#L937 + backlog++ + } err := e.listen(backlog) if err != nil { if !err.IgnoreStats() { diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 2a4667906..230fa6ebe 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -68,6 +68,9 @@ const ( // DefaultSynRetries is the default value for the number of SYN retransmits // before a connect is aborted. DefaultSynRetries = 6 + + // MaxListenBacklog is the maximum limit of listen backlog supported. + MaxListenBacklog = 1024 ) const ( @@ -75,63 +78,6 @@ const ( ccCubic = "cubic" ) -// syncRcvdCounter tracks the number of endpoints in the SYN-RCVD state. The -// value is protected by a mutex so that we can increment only when it's -// guaranteed not to go above a threshold. -type synRcvdCounter struct { - sync.Mutex - value uint64 - pending sync.WaitGroup - threshold uint64 -} - -// inc tries to increment the global number of endpoints in SYN-RCVD state. It -// succeeds if the increment doesn't make the count go beyond the threshold, and -// fails otherwise. -func (s *synRcvdCounter) inc() bool { - s.Lock() - defer s.Unlock() - if s.value >= s.threshold { - return false - } - - s.pending.Add(1) - s.value++ - - return true -} - -// dec atomically decrements the global number of endpoints in SYN-RCVD -// state. It must only be called if a previous call to inc succeeded. -func (s *synRcvdCounter) dec() { - s.Lock() - defer s.Unlock() - s.value-- - s.pending.Done() -} - -// synCookiesInUse returns true if the synRcvdCount is greater than -// SynRcvdCountThreshold. -func (s *synRcvdCounter) synCookiesInUse() bool { - s.Lock() - defer s.Unlock() - return s.value >= s.threshold -} - -// SetThreshold sets synRcvdCounter.Threshold to ths new threshold. -func (s *synRcvdCounter) SetThreshold(threshold uint64) { - s.Lock() - defer s.Unlock() - s.threshold = threshold -} - -// Threshold returns the current value of synRcvdCounter.Threhsold. 
-func (s *synRcvdCounter) Threshold() uint64 { - s.Lock() - defer s.Unlock() - return s.threshold -} - type protocol struct { stack *stack.Stack @@ -139,6 +85,7 @@ type protocol struct { sackEnabled bool recovery tcpip.TCPRecovery delayEnabled bool + alwaysUseSynCookies bool sendBufferSize tcpip.TCPSendBufferSizeRangeOption recvBufferSize tcpip.TCPReceiveBufferSizeRangeOption congestionControl string @@ -150,7 +97,6 @@ type protocol struct { minRTO time.Duration maxRTO time.Duration maxRetries uint32 - synRcvdCount synRcvdCounter synRetries uint8 dispatcher dispatcher } @@ -373,9 +319,9 @@ func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) tcpip p.mu.Unlock() return nil - case *tcpip.TCPSynRcvdCountThresholdOption: + case *tcpip.TCPAlwaysUseSynCookies: p.mu.Lock() - p.synRcvdCount.SetThreshold(uint64(*v)) + p.alwaysUseSynCookies = bool(*v) p.mu.Unlock() return nil @@ -480,9 +426,9 @@ func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) tcpip.Er p.mu.RUnlock() return nil - case *tcpip.TCPSynRcvdCountThresholdOption: + case *tcpip.TCPAlwaysUseSynCookies: p.mu.RLock() - *v = tcpip.TCPSynRcvdCountThresholdOption(p.synRcvdCount.Threshold()) + *v = tcpip.TCPAlwaysUseSynCookies(p.alwaysUseSynCookies) p.mu.RUnlock() return nil @@ -507,12 +453,6 @@ func (p *protocol) Wait() { p.dispatcher.wait() } -// SynRcvdCounter returns a reference to the synRcvdCount for this protocol -// instance. -func (p *protocol) SynRcvdCounter() *synRcvdCounter { - return &p.synRcvdCount -} - // Parse implements stack.TransportProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) bool { return parse.TCP(pkt) @@ -537,7 +477,6 @@ func NewProtocol(s *stack.Stack) stack.TransportProtocol { lingerTimeout: DefaultTCPLingerTimeout, timeWaitTimeout: DefaultTCPTimeWaitTimeout, timeWaitReuse: tcpip.TCPTimeWaitReuseLoopbackOnly, - synRcvdCount: synRcvdCounter{threshold: SynRcvdCountThreshold}, synRetries: DefaultSynRetries, minRTO: MinRTO, maxRTO: MaxRTO, diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go index 81f800cad..20c9761f2 100644 --- a/pkg/tcpip/transport/tcp/tcp_sack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go @@ -160,12 +160,9 @@ func TestSackPermittedAccept(t *testing.T) { defer c.Cleanup() if tc.cookieEnabled { - // Set the SynRcvd threshold to - // zero to force a syn cookie - // based accept to happen. - var opt tcpip.TCPSynRcvdCountThresholdOption + opt := tcpip.TCPAlwaysUseSynCookies(true) if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) + t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) } } setStackSACKPermitted(t, c, sackEnabled) @@ -235,12 +232,9 @@ func TestSackDisabledAccept(t *testing.T) { defer c.Cleanup() if tc.cookieEnabled { - // Set the SynRcvd threshold to - // zero to force a syn cookie - // based accept to happen. 
- var opt tcpip.TCPSynRcvdCountThresholdOption + opt := tcpip.TCPAlwaysUseSynCookies(true) if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) + t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) } } diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 9c23469f2..5605a4390 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -955,11 +955,7 @@ func TestUserSuppliedMSSOnConnect(t *testing.T) { // when completing the handshake for a new TCP connection from a TCP // listening socket. It should be present in the sent TCP SYN-ACK segment. func TestUserSuppliedMSSOnListenAccept(t *testing.T) { - const ( - nonSynCookieAccepts = 2 - totalAccepts = 4 - mtu = 5000 - ) + const mtu = 5000 ips := []struct { name string @@ -1033,12 +1029,6 @@ func TestUserSuppliedMSSOnListenAccept(t *testing.T) { ip.createEP(c) - // Set the SynRcvd threshold to force a syn cookie based accept to happen. - opt := tcpip.TCPSynRcvdCountThresholdOption(nonSynCookieAccepts) - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) - } - if err := c.EP.SetSockOptInt(tcpip.MaxSegOption, int(test.setMSS)); err != nil { t.Fatalf("SetSockOptInt(MaxSegOption, %d): %s", test.setMSS, err) } @@ -1048,13 +1038,17 @@ func TestUserSuppliedMSSOnListenAccept(t *testing.T) { t.Fatalf("Bind(%+v): %s:", bindAddr, err) } - if err := c.EP.Listen(totalAccepts); err != nil { - t.Fatalf("Listen(%d): %s:", totalAccepts, err) + backlog := 5 + // Keep the number of client requests twice to the backlog + // such that half of the connections do not use syncookies + // and the other half does. + clientConnects := backlog * 2 + + if err := c.EP.Listen(backlog); err != nil { + t.Fatalf("Listen(%d): %s:", backlog, err) } - // The first nonSynCookieAccepts packets sent will trigger a gorooutine - // based accept. The rest will trigger a cookie based accept. - for i := 0; i < totalAccepts; i++ { + for i := 0; i < clientConnects; i++ { // Send a SYN requests. iss := seqnum.Value(i) srcPort := context.TestPort + uint16(i) @@ -3087,11 +3081,9 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) { c := context.New(t, mtu) defer c.Cleanup() - // Set the SynRcvd threshold to zero to force a syn cookie based accept - // to happen. - opt := tcpip.TCPSynRcvdCountThresholdOption(0) + opt := tcpip.TCPAlwaysUseSynCookies(true) if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) + t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) } // Create EP and start listening. @@ -5363,7 +5355,7 @@ func TestListenBacklogFull(t *testing.T) { } lastPortOffset := uint16(0) - for ; int(lastPortOffset) < listenBacklog; lastPortOffset++ { + for ; int(lastPortOffset) < listenBacklog+1; lastPortOffset++ { executeHandshake(t, c, context.TestPort+lastPortOffset, false /*synCookieInUse */) } @@ -5671,15 +5663,13 @@ func TestListenSynRcvdQueueFull(t *testing.T) { } // Test acceptance. - // Start listening. 
- listenBacklog := 1 - if err := c.EP.Listen(listenBacklog); err != nil { + if err := c.EP.Listen(0); err != nil { t.Fatalf("Listen failed: %s", err) } // Send two SYN's the first one should get a SYN-ACK, the // second one should not get any response and is dropped as - // the synRcvd count will be equal to backlog. + // the accept queue is full. irs := seqnum.Value(context.TestInitialSequenceNumber) c.SendPacket(nil, &context.Headers{ SrcPort: context.TestPort, @@ -5701,23 +5691,7 @@ func TestListenSynRcvdQueueFull(t *testing.T) { } checker.IPv4(t, b, checker.TCP(tcpCheckers...)) - // Now execute send one more SYN. The stack should not respond as the backlog - // is full at this point. - // - // NOTE: we did not complete the handshake for the previous one so the - // accept backlog should be empty and there should be one connection in - // synRcvd state. - c.SendPacket(nil, &context.Headers{ - SrcPort: context.TestPort + 1, - DstPort: context.StackPort, - Flags: header.TCPFlagSyn, - SeqNum: seqnum.Value(889), - RcvWnd: 30000, - }) - c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond) - - // Now complete the previous connection and verify that there is a connection - // to accept. + // Now complete the previous connection. // Send ACK. c.SendPacket(nil, &context.Headers{ SrcPort: context.TestPort, @@ -5728,11 +5702,24 @@ func TestListenSynRcvdQueueFull(t *testing.T) { RcvWnd: 30000, }) - // Try to accept the connections in the backlog. + // Verify if that is delivered to the accept queue. we, ch := waiter.NewChannelEntry(nil) c.WQ.EventRegister(&we, waiter.ReadableEvents) defer c.WQ.EventUnregister(&we) + <-ch + + // Now execute send one more SYN. The stack should not respond as the backlog + // is full at this point. + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort + 1, + DstPort: context.StackPort, + Flags: header.TCPFlagSyn, + SeqNum: seqnum.Value(889), + RcvWnd: 30000, + }) + c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond) + // Try to accept the connections in the backlog. newEP, _, err := c.EP.Accept(nil) if _, ok := err.(*tcpip.ErrWouldBlock); ok { // Wait for connection to be established. @@ -5764,11 +5751,6 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() - opt := tcpip.TCPSynRcvdCountThresholdOption(1) - if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) - } - // Create TCP endpoint. var err tcpip.Error c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) @@ -5781,9 +5763,8 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) { t.Fatalf("Bind failed: %s", err) } - // Start listening. - listenBacklog := 1 - if err := c.EP.Listen(listenBacklog); err != nil { + // Test for SynCookies usage after filling up the backlog. 
+ if err := c.EP.Listen(0); err != nil { t.Fatalf("Listen failed: %s", err) } @@ -6066,7 +6047,7 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) { if err := c.EP.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %s", err) } - if err := c.EP.Listen(1); err != nil { + if err := c.EP.Listen(0); err != nil { t.Fatalf("Listen failed: %s", err) } diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go index 2949588ce..1deb1fe4d 100644 --- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go @@ -139,9 +139,9 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS defer c.Cleanup() if cookieEnabled { - var opt tcpip.TCPSynRcvdCountThresholdOption + opt := tcpip.TCPAlwaysUseSynCookies(true) if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) + t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) } } @@ -202,9 +202,9 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd defer c.Cleanup() if cookieEnabled { - var opt tcpip.TCPSynRcvdCountThresholdOption + opt := tcpip.TCPAlwaysUseSynCookies(true) if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { - t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err) + t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) } } diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl index 34e83ec49..634c15727 100644 --- a/test/packetimpact/runner/defs.bzl +++ b/test/packetimpact/runner/defs.bzl @@ -246,6 +246,12 @@ ALL_TESTS = [ expect_netstack_failure = True, ), PacketimpactTestInfo( + name = "tcp_listen_backlog", + ), + PacketimpactTestInfo( + name = "tcp_syncookie", + ), + PacketimpactTestInfo( name = "icmpv6_param_problem", ), PacketimpactTestInfo( diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD index 92103c1e9..83ff70951 100644 --- a/test/packetimpact/tests/BUILD +++ b/test/packetimpact/tests/BUILD @@ -385,6 +385,26 @@ packetimpact_testbench( ], ) +packetimpact_testbench( + name = "tcp_listen_backlog", + srcs = ["tcp_listen_backlog_test.go"], + deps = [ + "//pkg/tcpip/header", + "//test/packetimpact/testbench", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +packetimpact_testbench( + name = "tcp_syncookie", + srcs = ["tcp_syncookie_test.go"], + deps = [ + "//pkg/tcpip/header", + "//test/packetimpact/testbench", + "@org_golang_x_sys//unix:go_default_library", + ], +) + validate_all_tests() [packetimpact_go_test( diff --git a/test/packetimpact/tests/tcp_listen_backlog_test.go b/test/packetimpact/tests/tcp_listen_backlog_test.go new file mode 100644 index 000000000..26c812d0a --- /dev/null +++ b/test/packetimpact/tests/tcp_listen_backlog_test.go @@ -0,0 +1,86 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp_listen_backlog_test + +import ( + "flag" + "testing" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/test/packetimpact/testbench" +) + +func init() { + testbench.Initialize(flag.CommandLine) +} + +// TestTCPListenBacklog tests for a listening endpoint behavior: +// (1) reply to more SYNs than what is configured as listen backlog +// (2) ignore ACKs (that complete a handshake) when the accept queue is full +// (3) ignore incoming SYNs when the accept queue is full +func TestTCPListenBacklog(t *testing.T) { + dut := testbench.NewDUT(t) + + // Listening endpoint accepts one more connection than the listen backlog. + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 0 /*backlog*/) + + var establishedConn testbench.TCPIPv4 + var incompleteConn testbench.TCPIPv4 + + // Test if the DUT listener replies to more SYNs than listen backlog+1 + for i, conn := range []*testbench.TCPIPv4{&establishedConn, &incompleteConn} { + *conn = dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + // Expect dut connection to have transitioned to SYN-RCVD state. + conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)}) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil { + t.Fatalf("expected SYN-ACK for %d connection, %s", i, err) + } + } + defer establishedConn.Close(t) + defer incompleteConn.Close(t) + + // Send the ACK to complete handshake. + establishedConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)}) + dut.PollOne(t, listenFd, unix.POLLIN, time.Second) + + // Send the ACK to complete handshake, expect this to be ignored by the + // listener. + incompleteConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)}) + + // Drain the accept queue to enable poll for subsequent connections on the + // listener. + dut.Accept(t, listenFd) + + // The ACK for the incomplete connection should be ignored by the + // listening endpoint and the poll on listener should now time out. + if pfds := dut.Poll(t, []unix.PollFd{{Fd: listenFd, Events: unix.POLLIN}}, time.Second); len(pfds) != 0 { + t.Fatalf("got dut.Poll(...) = %#v", pfds) + } + + // Re-send the ACK to complete handshake and re-fill the accept-queue. + incompleteConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)}) + dut.PollOne(t, listenFd, unix.POLLIN, time.Second) + + // Now initiate a new connection when the accept queue is full. + connectingConn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + defer connectingConn.Close(t) + // Expect dut connection to drop the SYN and let the client stay in SYN_SENT state. 
+ connectingConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)}) + if got, err := connectingConn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err == nil { + t.Fatalf("expected no SYN-ACK, but got %s", got) + } +} diff --git a/test/packetimpact/tests/tcp_syncookie_test.go b/test/packetimpact/tests/tcp_syncookie_test.go new file mode 100644 index 000000000..1c21c62ff --- /dev/null +++ b/test/packetimpact/tests/tcp_syncookie_test.go @@ -0,0 +1,70 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp_syncookie_test + +import ( + "flag" + "testing" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/test/packetimpact/testbench" +) + +func init() { + testbench.Initialize(flag.CommandLine) +} + +// TestSynCookie test if the DUT listener is replying back using syn cookies. +// The test does not complete the handshake by not sending the ACK to SYNACK. +// When syncookies are not used, this forces the listener to retransmit SYNACK. +// And when syncookies are being used, there is no such retransmit. +func TestTCPSynCookie(t *testing.T) { + dut := testbench.NewDUT(t) + + // Listening endpoint accepts one more connection than the listen backlog. + _, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/) + + var withoutSynCookieConn testbench.TCPIPv4 + var withSynCookieConn testbench.TCPIPv4 + + // Test if the DUT listener replies to more SYNs than listen backlog+1 + for _, conn := range []*testbench.TCPIPv4{&withoutSynCookieConn, &withSynCookieConn} { + *conn = dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + } + defer withoutSynCookieConn.Close(t) + defer withSynCookieConn.Close(t) + + checkSynAck := func(t *testing.T, conn *testbench.TCPIPv4, expectRetransmit bool) { + // Expect dut connection to have transitioned to SYN-RCVD state. 
+ conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)}) + if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil { + t.Fatalf("expected SYN-ACK, but got %s", err) + } + + // If the DUT listener is using syn cookies, it will not retransmit SYNACK + got, err := conn.ExpectData(t, &testbench.TCP{SeqNum: testbench.Uint32(uint32(*conn.RemoteSeqNum(t) - 1)), Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, 2*time.Second) + if expectRetransmit && err != nil { + t.Fatalf("expected retransmitted SYN-ACK, but got %s", err) + } + if !expectRetransmit && err == nil { + t.Fatalf("expected no retransmitted SYN-ACK, but got %s", got) + } + } + + t.Run("without syncookies", func(t *testing.T) { checkSynAck(t, &withoutSynCookieConn, true /*expectRetransmit*/) }) + t.Run("with syncookies", func(t *testing.T) { checkSynAck(t, &withSynCookieConn, false /*expectRetransmit*/) }) +} diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index 597b5bcb1..d391363fb 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -489,13 +489,6 @@ void TestListenWhileConnect(const TestParam& param, TestAddress const& listener = param.listener; TestAddress const& connector = param.connector; - constexpr int kBacklog = 2; - // Linux completes one more connection than the listen backlog argument. - // To ensure that there is at least one client connection that stays in - // connecting state, keep 2 more client connections than the listen backlog. - // gVisor differs in this behavior though, gvisor.dev/issue/3153. - constexpr int kClients = kBacklog + 2; - // Create the listening socket. FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); @@ -503,6 +496,13 @@ void TestListenWhileConnect(const TestParam& param, ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len), SyscallSucceeds()); + // This test is only interested in deterministically getting a socket in + // connecting state. For that, we use a listen backlog of zero which would + // mean there is exactly one connection that gets established and is enqueued + // to the accept queue. We poll on the listener to ensure that is enqueued. + // After that the subsequent client connect will stay in connecting state as + // the accept queue is full. + constexpr int kBacklog = 0; ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds()); // Get the port bound by the listening socket. 
@@ -515,42 +515,49 @@ void TestListenWhileConnect(const TestParam& param, sockaddr_storage conn_addr = connector.addr; ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); - std::vector<FileDescriptor> clients; - for (int i = 0; i < kClients; i++) { - FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE( - Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP)); - int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr), - connector.addr_len); - if (ret != 0) { - EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS)); - clients.push_back(std::move(client)); - } + FileDescriptor established_client = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); + ASSERT_THAT( + connect(established_client.get(), reinterpret_cast<sockaddr*>(&conn_addr), + connector.addr_len), + SyscallSucceeds()); + + // Ensure that the accept queue has the completed connection. + constexpr int kTimeout = 10000; + pollfd pfd = { + .fd = listen_fd.get(), + .events = POLLIN, + }; + ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1)); + ASSERT_EQ(pfd.revents, POLLIN); + + FileDescriptor connecting_client = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP)); + // Keep the last client in connecting state. + int ret = + connect(connecting_client.get(), reinterpret_cast<sockaddr*>(&conn_addr), + connector.addr_len); + if (ret != 0) { + EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS)); } stopListen(listen_fd); - for (auto& client : clients) { - constexpr int kTimeout = 10000; + std::array<std::pair<int, int>, 2> sockets = { + std::make_pair(established_client.get(), ECONNRESET), + std::make_pair(connecting_client.get(), ECONNREFUSED), + }; + for (size_t i = 0; i < sockets.size(); i++) { + SCOPED_TRACE(absl::StrCat("i=", i)); + auto [fd, expected_errno] = sockets[i]; pollfd pfd = { - .fd = client.get(), - .events = POLLIN, + .fd = fd, }; - // When the listening socket is closed, then we expect the remote to reset - // the connection. - ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1)); - ASSERT_EQ(pfd.revents, POLLIN | POLLHUP | POLLERR); + // When the listening socket is closed, the peer would reset the connection. + EXPECT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1)); + EXPECT_EQ(pfd.revents, POLLHUP | POLLERR); char c; - // Subsequent read can fail with: - // ECONNRESET: If the client connection was established and was reset by the - // remote. - // ECONNREFUSED: If the client connection failed to be established. - ASSERT_THAT(read(client.get(), &c, sizeof(c)), - AnyOf(SyscallFailsWithErrno(ECONNRESET), - SyscallFailsWithErrno(ECONNREFUSED))); - // The last client connection would be in connecting (SYN_SENT) state. - if (client.get() == clients[kClients - 1].get()) { - ASSERT_EQ(errno, ECONNREFUSED) << strerror(errno); - } + EXPECT_THAT(read(fd, &c, sizeof(c)), SyscallFailsWithErrno(expected_errno)); } } @@ -570,7 +577,59 @@ TEST_P(SocketInetLoopbackTest, TCPListenShutdownWhileConnect) { // random save as established connections which can't be delivered to the accept // queue because the queue is full are not correctly delivered after restore // causing the last accept to timeout on the restore. 
-TEST_P(SocketInetLoopbackTest, TCPbacklog_NoRandomSave) { +TEST_P(SocketInetLoopbackTest, TCPAcceptBacklogSizes_NoRandomSave) { + auto const& param = GetParam(); + + TestAddress const& listener = param.listener; + TestAddress const& connector = param.connector; + + // Create the listening socket. + const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); + sockaddr_storage listen_addr = listener.addr; + ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr), + listener.addr_len), + SyscallSucceeds()); + // Get the port bound by the listening socket. + socklen_t addrlen = listener.addr_len; + ASSERT_THAT(getsockname(listen_fd.get(), + reinterpret_cast<sockaddr*>(&listen_addr), &addrlen), + SyscallSucceeds()); + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); + std::array<int, 3> backlogs = {-1, 0, 1}; + for (auto& backlog : backlogs) { + ASSERT_THAT(listen(listen_fd.get(), backlog), SyscallSucceeds()); + + int expected_accepts; + if (backlog < 0) { + expected_accepts = 1024; + } else { + expected_accepts = backlog + 1; + } + for (int i = 0; i < expected_accepts; i++) { + SCOPED_TRACE(absl::StrCat("i=", i)); + // Connect to the listening socket. + const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); + sockaddr_storage conn_addr = connector.addr; + ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); + ASSERT_THAT( + RetryEINTR(connect)(conn_fd.get(), + reinterpret_cast<struct sockaddr*>(&conn_addr), + connector.addr_len), + SyscallSucceeds()); + const FileDescriptor accepted = + ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); + } + } +} + +// TODO(b/157236388): Remove _NoRandomSave once bug is fixed. Test fails w/ +// random save as established connections which can't be delivered to the accept +// queue because the queue is full are not correctly delivered after restore +// causing the last accept to timeout on the restore. +TEST_P(SocketInetLoopbackTest, TCPBacklog_NoRandomSave) { auto const& param = GetParam(); TestAddress const& listener = param.listener; @@ -595,6 +654,7 @@ TEST_P(SocketInetLoopbackTest, TCPbacklog_NoRandomSave) { ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); int i = 0; while (1) { + SCOPED_TRACE(absl::StrCat("i=", i)); int ret; // Connect to the listening socket. @@ -620,103 +680,133 @@ TEST_P(SocketInetLoopbackTest, TCPbacklog_NoRandomSave) { i++; } + int client_conns = i; + int accepted_conns = 0; for (; i != 0; i--) { - // Accept the connection. - // - // We have to assign a name to the accepted socket, as unamed temporary - // objects are destructed upon full evaluation of the expression it is in, - // potentially causing the connecting socket to fail to shutdown properly. - auto accepted = - ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); + SCOPED_TRACE(absl::StrCat("i=", i)); + pollfd pfd = { + .fd = listen_fd.get(), + .events = POLLIN, + }; + // Look for incoming connections to accept. The last connect request could + // be established from the client side, but the ACK of the handshake could + // be dropped by the listener if the accept queue was filled up by the + // previous connect. + int ret; + ASSERT_THAT(ret = poll(&pfd, 1, 3000), SyscallSucceeds()); + if (ret == 0) break; + if (pfd.revents == POLLIN) { + // Accept the connection. 
+ // + // We have to assign a name to the accepted socket, as unamed temporary + // objects are destructed upon full evaluation of the expression it is in, + // potentially causing the connecting socket to fail to shutdown properly. + auto accepted = + ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); + accepted_conns++; + } } + // We should accept at least listen backlog + 1 connections. As the stack is + // enqueuing established connections to the accept queue, newer SYNs could + // still be replied to causing those client connections would be accepted as + // we start dequeuing the queue. + ASSERT_GE(accepted_conns, kBacklogSize + 1); + ASSERT_GE(client_conns, accepted_conns); } -// Test if the stack completes atmost listen backlog number of client -// connections. It exercises the path of the stack that enqueues completed -// connections to accept queue vs new incoming SYNs. -TEST_P(SocketInetLoopbackTest, TCPConnectBacklog_NoRandomSave) { - const auto& param = GetParam(); - const TestAddress& listener = param.listener; - const TestAddress& connector = param.connector; +// TODO(b/157236388): Remove _NoRandomSave once bug is fixed. Test fails w/ +// random save as established connections which can't be delivered to the accept +// queue because the queue is full are not correctly delivered after restore +// causing the last accept to timeout on the restore. +TEST_P(SocketInetLoopbackTest, TCPBacklogAcceptAll_NoRandomSave) { + auto const& param = GetParam(); + TestAddress const& listener = param.listener; + TestAddress const& connector = param.connector; + // Create the listening socket. + FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); + sockaddr_storage listen_addr = listener.addr; + ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr), + listener.addr_len), + SyscallSucceeds()); constexpr int kBacklog = 1; - // Keep the number of client connections more than the listen backlog. - // Linux completes one more connection than the listen backlog argument. - // gVisor differs in this behavior though, gvisor.dev/issue/3153. - int kClients = kBacklog + 2; - if (IsRunningOnGvisor()) { - kClients--; - } + ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds()); - // Run the following test for few iterations to test race between accept queue - // getting filled with incoming SYNs. - for (int num = 0; num < 10; num++) { - FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( - Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); - sockaddr_storage listen_addr = listener.addr; - ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr), - listener.addr_len), - SyscallSucceeds()); - ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds()); + // Get the port bound by the listening socket. 
+ socklen_t addrlen = listener.addr_len; + ASSERT_THAT(getsockname(listen_fd.get(), + reinterpret_cast<sockaddr*>(&listen_addr), &addrlen), + SyscallSucceeds()); + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); - socklen_t addrlen = listener.addr_len; - ASSERT_THAT( - getsockname(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr), - &addrlen), - SyscallSucceeds()); - uint16_t const port = - ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); - sockaddr_storage conn_addr = connector.addr; - ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); + sockaddr_storage conn_addr = connector.addr; + ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); - std::vector<FileDescriptor> clients; - // Issue multiple non-blocking client connects. - for (int i = 0; i < kClients; i++) { - FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE( - Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP)); - int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr), - connector.addr_len); - if (ret != 0) { - EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS)); - } - clients.push_back(std::move(client)); + // Fill up the accept queue and trigger more client connections which would be + // waiting to be accepted. + std::array<FileDescriptor, kBacklog + 1> established_clients; + for (auto& fd : established_clients) { + fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); + ASSERT_THAT(connect(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr), + connector.addr_len), + SyscallSucceeds()); + } + std::array<FileDescriptor, kBacklog> waiting_clients; + for (auto& fd : waiting_clients) { + fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP)); + int ret = connect(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr), + connector.addr_len); + if (ret != 0) { + EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS)); } + } - // Now that client connects are issued, wait for the accept queue to get - // filled and ensure no new client connection is completed. - for (int i = 0; i < kClients; i++) { - pollfd pfd = { - .fd = clients[i].get(), - .events = POLLOUT, - }; - if (i < kClients - 1) { - // Poll for client side connection completions with a large timeout. - // We cannot poll on the listener side without calling accept as poll - // stays level triggered with non-zero accept queue length. - // - // Client side poll would not guarantee that the completed connection - // has been enqueued in to the acccept queue, but the fact that the - // listener ACKd the SYN, means that it cannot complete any new incoming - // SYNs when it has already ACKd for > backlog number of SYNs. - ASSERT_THAT(poll(&pfd, 1, 10000), SyscallSucceedsWithValue(1)) - << "num=" << num << " i=" << i << " kClients=" << kClients; - ASSERT_EQ(pfd.revents, POLLOUT) << "num=" << num << " i=" << i; - } else { - // Now that we expect accept queue filled up, ensure that the last - // client connection never completes with a smaller poll timeout. - ASSERT_THAT(poll(&pfd, 1, 1000), SyscallSucceedsWithValue(0)) - << "num=" << num << " i=" << i; - } + auto accept_connection = [&]() { + constexpr int kTimeout = 10000; + pollfd pfd = { + .fd = listen_fd.get(), + .events = POLLIN, + }; + ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1)); + ASSERT_EQ(pfd.revents, POLLIN); + // Accept the connection. 
+ // + // We have to assign a name to the accepted socket, as unamed temporary + // objects are destructed upon full evaluation of the expression it is in, + // potentially causing the connecting socket to fail to shutdown properly. + auto accepted = + ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); + }; - ASSERT_THAT(close(clients[i].release()), SyscallSucceedsWithValue(0)) - << "num=" << num << " i=" << i; - } - clients.clear(); - // We close the listening side and open a new listener. We could instead - // drain the accept queue by calling accept() and reuse the listener, but - // that is racy as the retransmitted SYNs could get ACKd as we make room in - // the accept queue. - ASSERT_THAT(close(listen_fd.release()), SyscallSucceedsWithValue(0)); + // Ensure that we accept all client connections. The waiting connections would + // get enqueued as we drain the accept queue. + for (int i = 0; i < std::size(established_clients); i++) { + SCOPED_TRACE(absl::StrCat("established clients i=", i)); + accept_connection(); + } + + // The waiting client connections could be in one of these 2 states: + // (1) SYN_SENT: if the SYN was dropped because accept queue was full + // (2) ESTABLISHED: if the listener sent back a SYNACK, but may have dropped + // the ACK from the client if the accept queue was full (send out a data to + // re-send that ACK, to address that case). + for (int i = 0; i < std::size(waiting_clients); i++) { + SCOPED_TRACE(absl::StrCat("waiting clients i=", i)); + constexpr int kTimeout = 10000; + pollfd pfd = { + .fd = waiting_clients[i].get(), + .events = POLLOUT, + }; + EXPECT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1)); + EXPECT_EQ(pfd.revents, POLLOUT); + char c; + EXPECT_THAT(RetryEINTR(send)(waiting_clients[i].get(), &c, sizeof(c), 0), + SyscallSucceedsWithValue(sizeof(c))); + accept_connection(); } } |
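
The new TCPAcceptBacklogSizes_NoRandomSave test above pins down the accept count expected for each backlog it exercises. A Go restatement of that expectation (the helper name is invented for illustration; the clamp to 1024 corresponds to MaxListenBacklog introduced in protocol.go):

package backlogexpectation

// expectedAccepts restates the expectation checked by
// TCPAcceptBacklogSizes_NoRandomSave for the backlogs it exercises (-1, 0
// and 1): a negative backlog behaves like the maximum of 1024
// (tcp.MaxListenBacklog), and otherwise the listener completes backlog+1
// connections before further SYNs are ignored.
func expectedAccepts(backlog int) int {
	if backlog < 0 {
		return 1024
	}
	return backlog + 1
}

In particular, listen(fd, 0) still yields exactly one acceptable connection, which is what the reworked TestListenWhileConnect and the new tcp_listen_backlog packetimpact test rely on.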