summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--pkg/sentry/syscalls/linux/sys_socket.go14
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/socket.go14
-rw-r--r--pkg/tcpip/tcpip.go20
-rw-r--r--pkg/tcpip/transport/tcp/accept.go91
-rw-r--r--pkg/tcpip/transport/tcp/connect.go20
-rw-r--r--pkg/tcpip/transport/tcp/dual_stack_test.go14
-rw-r--r--pkg/tcpip/transport/tcp/endpoint.go14
-rw-r--r--pkg/tcpip/transport/tcp/protocol.go77
-rw-r--r--pkg/tcpip/transport/tcp/tcp_sack_test.go14
-rw-r--r--pkg/tcpip/transport/tcp/tcp_test.go85
-rw-r--r--pkg/tcpip/transport/tcp/tcp_timestamp_test.go8
-rw-r--r--test/packetimpact/runner/defs.bzl6
-rw-r--r--test/packetimpact/tests/BUILD20
-rw-r--r--test/packetimpact/tests/tcp_listen_backlog_test.go86
-rw-r--r--test/packetimpact/tests/tcp_syncookie_test.go70
-rw-r--r--test/syscalls/linux/socket_inet_loopback.cc334
16 files changed, 527 insertions, 360 deletions
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 9bdf6d3d8..0141e8a96 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -35,12 +35,6 @@ import (
// LINT.IfChange
-// minListenBacklog is the minimum reasonable backlog for listening sockets.
-const minListenBacklog = 8
-
-// maxListenBacklog is the maximum allowed backlog for listening sockets.
-const maxListenBacklog = 1024
-
// maxAddrLen is the maximum socket address length we're willing to accept.
const maxAddrLen = 200
@@ -382,14 +376,6 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, syserror.ENOTSOCK
}
- // Per Linux, the backlog is silently capped to reasonable values.
- if backlog <= 0 {
- backlog = minListenBacklog
- }
- if backlog > maxListenBacklog {
- backlog = maxListenBacklog
- }
-
return 0, nil, s.Listen(t, int(backlog)).ToError()
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go
index a87a66146..7cc0be892 100644
--- a/pkg/sentry/syscalls/linux/vfs2/socket.go
+++ b/pkg/sentry/syscalls/linux/vfs2/socket.go
@@ -35,12 +35,6 @@ import (
"gvisor.dev/gvisor/pkg/hostarch"
)
-// minListenBacklog is the minimum reasonable backlog for listening sockets.
-const minListenBacklog = 8
-
-// maxListenBacklog is the maximum allowed backlog for listening sockets.
-const maxListenBacklog = 1024
-
// maxAddrLen is the maximum socket address length we're willing to accept.
const maxAddrLen = 200
@@ -386,14 +380,6 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, syserror.ENOTSOCK
}
- // Per Linux, the backlog is silently capped to reasonable values.
- if backlog <= 0 {
- backlog = minListenBacklog
- }
- if backlog > maxListenBacklog {
- backlog = maxListenBacklog
- }
-
return 0, nil, s.Listen(t, int(backlog)).ToError()
}
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 87ea09a5e..60de16579 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -786,6 +786,13 @@ func (*TCPRecovery) isGettableTransportProtocolOption() {}
func (*TCPRecovery) isSettableTransportProtocolOption() {}
+// TCPAlwaysUseSynCookies indicates unconditional usage of syncookies.
+type TCPAlwaysUseSynCookies bool
+
+func (*TCPAlwaysUseSynCookies) isGettableTransportProtocolOption() {}
+
+func (*TCPAlwaysUseSynCookies) isSettableTransportProtocolOption() {}
+
const (
// TCPRACKLossDetection indicates RACK is used for loss detection and
// recovery.
@@ -1020,19 +1027,6 @@ func (*TCPMaxRetriesOption) isGettableTransportProtocolOption() {}
func (*TCPMaxRetriesOption) isSettableTransportProtocolOption() {}
-// TCPSynRcvdCountThresholdOption is used by SetSockOpt/GetSockOpt to specify
-// the number of endpoints that can be in SYN-RCVD state before the stack
-// switches to using SYN cookies.
-type TCPSynRcvdCountThresholdOption uint64
-
-func (*TCPSynRcvdCountThresholdOption) isGettableSocketOption() {}
-
-func (*TCPSynRcvdCountThresholdOption) isSettableSocketOption() {}
-
-func (*TCPSynRcvdCountThresholdOption) isGettableTransportProtocolOption() {}
-
-func (*TCPSynRcvdCountThresholdOption) isSettableTransportProtocolOption() {}
-
// TCPSynRetriesOption is used by SetSockOpt/GetSockOpt to specify stack-wide
// default for number of times SYN is retransmitted before aborting a connect.
type TCPSynRetriesOption uint8
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 025b134e2..7372ebc08 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -51,11 +51,6 @@ const (
// timestamp and the current timestamp. If the difference is greater
// than maxTSDiff, the cookie is expired.
maxTSDiff = 2
-
- // SynRcvdCountThreshold is the default global maximum number of
- // connections that are allowed to be in SYN-RCVD state before TCP
- // starts using SYN cookies to accept connections.
- SynRcvdCountThreshold uint64 = 1000
)
var (
@@ -80,9 +75,6 @@ func encodeMSS(mss uint16) uint32 {
type listenContext struct {
stack *stack.Stack
- // synRcvdCount is a reference to the stack level synRcvdCount.
- synRcvdCount *synRcvdCounter
-
// rcvWnd is the receive window that is sent by this listening context
// in the initial SYN-ACK.
rcvWnd seqnum.Size
@@ -138,11 +130,6 @@ func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size,
listenEP: listenEP,
pendingEndpoints: make(map[stack.TransportEndpointID]*endpoint),
}
- p, ok := stk.TransportProtocolInstance(ProtocolNumber).(*protocol)
- if !ok {
- panic(fmt.Sprintf("unable to get TCP protocol instance from stack: %+v", stk))
- }
- l.synRcvdCount = p.SynRcvdCounter()
rand.Read(l.nonce[0][:])
rand.Read(l.nonce[1][:])
@@ -199,6 +186,14 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu
return (v - l.cookieHash(id, cookieTS, 1)) & hashMask, true
}
+func (l *listenContext) useSynCookies() bool {
+ var alwaysUseSynCookies tcpip.TCPAlwaysUseSynCookies
+ if err := l.stack.TransportProtocolOption(header.TCPProtocolNumber, &alwaysUseSynCookies); err != nil {
+ panic(fmt.Sprintf("TransportProtocolOption(%d, %T) = %s", header.TCPProtocolNumber, alwaysUseSynCookies, err))
+ }
+ return bool(alwaysUseSynCookies) || (l.listenEP != nil && l.listenEP.synRcvdBacklogFull())
+}
+
// createConnectingEndpoint creates a new endpoint in a connecting state, with
// the connection parameters given by the arguments.
func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, tcpip.Error) {
@@ -307,6 +302,7 @@ func (l *listenContext) startHandshake(s *segment, opts *header.TCPSynOptions, q
// Initialize and start the handshake.
h := ep.newPassiveHandshake(isn, irs, opts, deferAccept)
+ h.listenEP = l.listenEP
h.start()
return h, nil
}
@@ -485,7 +481,6 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
}
go func() {
- defer ctx.synRcvdCount.dec()
if err := h.complete(); err != nil {
e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
e.stats.FailedConnectionAttempts.Increment()
@@ -497,24 +492,29 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
h.ep.startAcceptedLoop()
e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
e.deliverAccepted(h.ep, false /*withSynCookie*/)
- }() // S/R-SAFE: synRcvdCount is the barrier.
+ }()
return nil
}
-func (e *endpoint) incSynRcvdCount() bool {
+func (e *endpoint) synRcvdBacklogFull() bool {
e.acceptMu.Lock()
- canInc := int(atomic.LoadInt32(&e.synRcvdCount)) < cap(e.acceptedChan)
+ acceptedChanCap := cap(e.acceptedChan)
e.acceptMu.Unlock()
- if canInc {
- atomic.AddInt32(&e.synRcvdCount, 1)
- }
- return canInc
+ // The allocated accepted channel size would always be one greater than the
+ // listen backlog. But, the SYNRCVD connections count is always checked
+ // against the listen backlog value for Linux parity reason.
+ // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/inet_connection_sock.h#L280
+ //
+ // We maintain an equality check here as the synRcvdCount is incremented
+ // and compared only from a single listener context and the capacity of
+ // the accepted channel can only increase by a new listen call.
+ return int(atomic.LoadInt32(&e.synRcvdCount)) == acceptedChanCap-1
}
func (e *endpoint) acceptQueueIsFull() bool {
e.acceptMu.Lock()
- full := len(e.acceptedChan)+int(atomic.LoadInt32(&e.synRcvdCount)) >= cap(e.acceptedChan)
+ full := len(e.acceptedChan) == cap(e.acceptedChan)
e.acceptMu.Unlock()
return full
}
@@ -539,17 +539,13 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err
switch {
case s.flags == header.TCPFlagSyn:
opts := parseSynSegmentOptions(s)
- if ctx.synRcvdCount.inc() {
- // Only handle the syn if the following conditions hold
- // - accept queue is not full.
- // - number of connections in synRcvd state is less than the
- // backlog.
- if !e.acceptQueueIsFull() && e.incSynRcvdCount() {
+ if !ctx.useSynCookies() {
+ if !e.acceptQueueIsFull() {
s.incRef()
+ atomic.AddInt32(&e.synRcvdCount, 1)
_ = e.handleSynSegment(ctx, s, &opts)
return nil
}
- ctx.synRcvdCount.dec()
e.stack.Stats().TCP.ListenOverflowSynDrop.Increment()
e.stats.ReceiveErrors.ListenOverflowSynDrop.Increment()
e.stack.Stats().DroppedPackets.Increment()
@@ -615,25 +611,6 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err
return nil
}
- if !ctx.synRcvdCount.synCookiesInUse() {
- // When not using SYN cookies, as per RFC 793, section 3.9, page 64:
- // Any acknowledgment is bad if it arrives on a connection still in
- // the LISTEN state. An acceptable reset segment should be formed
- // for any arriving ACK-bearing segment. The RST should be
- // formatted as follows:
- //
- // <SEQ=SEG.ACK><CTL=RST>
- //
- // Send a reset as this is an ACK for which there is no
- // half open connections and we are not using cookies
- // yet.
- //
- // The only time we should reach here when a connection
- // was opened and closed really quickly and a delayed
- // ACK was received from the sender.
- return replyWithReset(e.stack, s, e.sendTOS, e.ttl)
- }
-
iss := s.ackNumber - 1
irs := s.sequenceNumber - 1
@@ -651,7 +628,23 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Err
if !ok || int(data) >= len(mssTable) {
e.stack.Stats().TCP.ListenOverflowInvalidSynCookieRcvd.Increment()
e.stack.Stats().DroppedPackets.Increment()
- return nil
+
+ // When not using SYN cookies, as per RFC 793, section 3.9, page 64:
+ // Any acknowledgment is bad if it arrives on a connection still in
+ // the LISTEN state. An acceptable reset segment should be formed
+ // for any arriving ACK-bearing segment. The RST should be
+ // formatted as follows:
+ //
+ // <SEQ=SEG.ACK><CTL=RST>
+ //
+ // Send a reset as this is an ACK for which there is no
+ // half open connections and we are not using cookies
+ // yet.
+ //
+ // The only time we should reach here when a connection
+ // was opened and closed really quickly and a delayed
+ // ACK was received from the sender.
+ return replyWithReset(e.stack, s, e.sendTOS, e.ttl)
}
e.stack.Stats().TCP.ListenOverflowSynCookieRcvd.Increment()
// Create newly accepted endpoint and deliver it.
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index a9e978cf6..8f0f0c3e9 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -65,11 +65,12 @@ const (
// NOTE: handshake.ep.mu is held during handshake processing. It is released if
// we are going to block and reacquired when we start processing an event.
type handshake struct {
- ep *endpoint
- state handshakeState
- active bool
- flags header.TCPFlags
- ackNum seqnum.Value
+ ep *endpoint
+ listenEP *endpoint
+ state handshakeState
+ active bool
+ flags header.TCPFlags
+ ackNum seqnum.Value
// iss is the initial send sequence number, as defined in RFC 793.
iss seqnum.Value
@@ -394,6 +395,15 @@ func (h *handshake) synRcvdState(s *segment) tcpip.Error {
return nil
}
+ // Drop the ACK if the accept queue is full.
+ // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_ipv4.c#L1523
+ // We could abort the connection as well with a tunable as in
+ // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_minisocks.c#L788
+ if listenEP := h.listenEP; listenEP != nil && listenEP.acceptQueueIsFull() {
+ listenEP.stack.Stats().DroppedPackets.Increment()
+ return nil
+ }
+
// Update timestamp if required. See RFC7323, section-4.3.
if h.ep.sendTSOk && s.parsedOptions.TS {
h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index f6a16f96e..d6d68f128 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -565,17 +565,15 @@ func TestV4AcceptOnV4(t *testing.T) {
}
func testV4ListenClose(t *testing.T, c *context.Context) {
- // Set the SynRcvd threshold to zero to force a syn cookie based accept
- // to happen.
- var opt tcpip.TCPSynRcvdCountThresholdOption
+ opt := tcpip.TCPAlwaysUseSynCookies(true)
if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
- t.Fatalf("setting TCPSynRcvdCountThresholdOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
+ t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
}
- const n = uint16(32)
+ const n = 32
// Start listening.
- if err := c.EP.Listen(int(tcp.SynRcvdCountThreshold + 1)); err != nil {
+ if err := c.EP.Listen(n); err != nil {
t.Fatalf("Listen failed: %v", err)
}
@@ -591,9 +589,9 @@ func testV4ListenClose(t *testing.T, c *context.Context) {
})
}
- // Each of these ACK's will cause a syn-cookie based connection to be
+ // Each of these ACKs will cause a syn-cookie based connection to be
// accepted and delivered to the listening endpoint.
- for i := uint16(0); i < n; i++ {
+ for i := 0; i < n; i++ {
b := c.GetPacket()
tcp := header.TCP(header.IPv4(b).Payload())
iss := seqnum.Value(tcp.SequenceNumber())
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index c5daba232..9438056f9 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2474,6 +2474,20 @@ func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
func (e *endpoint) Listen(backlog int) tcpip.Error {
+ if uint32(backlog) > MaxListenBacklog {
+ // Linux treats incoming backlog as uint with a limit defined by
+ // sysctl_somaxconn.
+ // https://github.com/torvalds/linux/blob/7acac4b3196/net/socket.c#L1666
+ //
+ // We use the backlog to allocate a channel of that size, hence enforce
+ // a hard limit for the backlog.
+ backlog = MaxListenBacklog
+ } else {
+ // Accept one more than the configured listen backlog to keep in parity with
+ // Linux. Ref, because of missing equality check here:
+ // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/sock.h#L937
+ backlog++
+ }
err := e.listen(backlog)
if err != nil {
if !err.IgnoreStats() {
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 2a4667906..230fa6ebe 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -68,6 +68,9 @@ const (
// DefaultSynRetries is the default value for the number of SYN retransmits
// before a connect is aborted.
DefaultSynRetries = 6
+
+ // MaxListenBacklog is the maximum limit of listen backlog supported.
+ MaxListenBacklog = 1024
)
const (
@@ -75,63 +78,6 @@ const (
ccCubic = "cubic"
)
-// syncRcvdCounter tracks the number of endpoints in the SYN-RCVD state. The
-// value is protected by a mutex so that we can increment only when it's
-// guaranteed not to go above a threshold.
-type synRcvdCounter struct {
- sync.Mutex
- value uint64
- pending sync.WaitGroup
- threshold uint64
-}
-
-// inc tries to increment the global number of endpoints in SYN-RCVD state. It
-// succeeds if the increment doesn't make the count go beyond the threshold, and
-// fails otherwise.
-func (s *synRcvdCounter) inc() bool {
- s.Lock()
- defer s.Unlock()
- if s.value >= s.threshold {
- return false
- }
-
- s.pending.Add(1)
- s.value++
-
- return true
-}
-
-// dec atomically decrements the global number of endpoints in SYN-RCVD
-// state. It must only be called if a previous call to inc succeeded.
-func (s *synRcvdCounter) dec() {
- s.Lock()
- defer s.Unlock()
- s.value--
- s.pending.Done()
-}
-
-// synCookiesInUse returns true if the synRcvdCount is greater than
-// SynRcvdCountThreshold.
-func (s *synRcvdCounter) synCookiesInUse() bool {
- s.Lock()
- defer s.Unlock()
- return s.value >= s.threshold
-}
-
-// SetThreshold sets synRcvdCounter.Threshold to ths new threshold.
-func (s *synRcvdCounter) SetThreshold(threshold uint64) {
- s.Lock()
- defer s.Unlock()
- s.threshold = threshold
-}
-
-// Threshold returns the current value of synRcvdCounter.Threhsold.
-func (s *synRcvdCounter) Threshold() uint64 {
- s.Lock()
- defer s.Unlock()
- return s.threshold
-}
-
type protocol struct {
stack *stack.Stack
@@ -139,6 +85,7 @@ type protocol struct {
sackEnabled bool
recovery tcpip.TCPRecovery
delayEnabled bool
+ alwaysUseSynCookies bool
sendBufferSize tcpip.TCPSendBufferSizeRangeOption
recvBufferSize tcpip.TCPReceiveBufferSizeRangeOption
congestionControl string
@@ -150,7 +97,6 @@ type protocol struct {
minRTO time.Duration
maxRTO time.Duration
maxRetries uint32
- synRcvdCount synRcvdCounter
synRetries uint8
dispatcher dispatcher
}
@@ -373,9 +319,9 @@ func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) tcpip
p.mu.Unlock()
return nil
- case *tcpip.TCPSynRcvdCountThresholdOption:
+ case *tcpip.TCPAlwaysUseSynCookies:
p.mu.Lock()
- p.synRcvdCount.SetThreshold(uint64(*v))
+ p.alwaysUseSynCookies = bool(*v)
p.mu.Unlock()
return nil
@@ -480,9 +426,9 @@ func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) tcpip.Er
p.mu.RUnlock()
return nil
- case *tcpip.TCPSynRcvdCountThresholdOption:
+ case *tcpip.TCPAlwaysUseSynCookies:
p.mu.RLock()
- *v = tcpip.TCPSynRcvdCountThresholdOption(p.synRcvdCount.Threshold())
+ *v = tcpip.TCPAlwaysUseSynCookies(p.alwaysUseSynCookies)
p.mu.RUnlock()
return nil
@@ -507,12 +453,6 @@ func (p *protocol) Wait() {
p.dispatcher.wait()
}
-// SynRcvdCounter returns a reference to the synRcvdCount for this protocol
-// instance.
-func (p *protocol) SynRcvdCounter() *synRcvdCounter {
- return &p.synRcvdCount
-}
-
// Parse implements stack.TransportProtocol.Parse.
func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
return parse.TCP(pkt)
@@ -537,7 +477,6 @@ func NewProtocol(s *stack.Stack) stack.TransportProtocol {
lingerTimeout: DefaultTCPLingerTimeout,
timeWaitTimeout: DefaultTCPTimeWaitTimeout,
timeWaitReuse: tcpip.TCPTimeWaitReuseLoopbackOnly,
- synRcvdCount: synRcvdCounter{threshold: SynRcvdCountThreshold},
synRetries: DefaultSynRetries,
minRTO: MinRTO,
maxRTO: MaxRTO,
diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go
index 81f800cad..20c9761f2 100644
--- a/pkg/tcpip/transport/tcp/tcp_sack_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go
@@ -160,12 +160,9 @@ func TestSackPermittedAccept(t *testing.T) {
defer c.Cleanup()
if tc.cookieEnabled {
- // Set the SynRcvd threshold to
- // zero to force a syn cookie
- // based accept to happen.
- var opt tcpip.TCPSynRcvdCountThresholdOption
+ opt := tcpip.TCPAlwaysUseSynCookies(true)
if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
- t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
+ t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
}
}
setStackSACKPermitted(t, c, sackEnabled)
@@ -235,12 +232,9 @@ func TestSackDisabledAccept(t *testing.T) {
defer c.Cleanup()
if tc.cookieEnabled {
- // Set the SynRcvd threshold to
- // zero to force a syn cookie
- // based accept to happen.
- var opt tcpip.TCPSynRcvdCountThresholdOption
+ opt := tcpip.TCPAlwaysUseSynCookies(true)
if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
- t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
+ t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
}
}
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 9c23469f2..5605a4390 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -955,11 +955,7 @@ func TestUserSuppliedMSSOnConnect(t *testing.T) {
// when completing the handshake for a new TCP connection from a TCP
// listening socket. It should be present in the sent TCP SYN-ACK segment.
func TestUserSuppliedMSSOnListenAccept(t *testing.T) {
- const (
- nonSynCookieAccepts = 2
- totalAccepts = 4
- mtu = 5000
- )
+ const mtu = 5000
ips := []struct {
name string
@@ -1033,12 +1029,6 @@ func TestUserSuppliedMSSOnListenAccept(t *testing.T) {
ip.createEP(c)
- // Set the SynRcvd threshold to force a syn cookie based accept to happen.
- opt := tcpip.TCPSynRcvdCountThresholdOption(nonSynCookieAccepts)
- if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
- t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
- }
-
if err := c.EP.SetSockOptInt(tcpip.MaxSegOption, int(test.setMSS)); err != nil {
t.Fatalf("SetSockOptInt(MaxSegOption, %d): %s", test.setMSS, err)
}
@@ -1048,13 +1038,17 @@ func TestUserSuppliedMSSOnListenAccept(t *testing.T) {
t.Fatalf("Bind(%+v): %s:", bindAddr, err)
}
- if err := c.EP.Listen(totalAccepts); err != nil {
- t.Fatalf("Listen(%d): %s:", totalAccepts, err)
+ backlog := 5
+ // Keep the number of client requests twice to the backlog
+ // such that half of the connections do not use syncookies
+ // and the other half does.
+ clientConnects := backlog * 2
+
+ if err := c.EP.Listen(backlog); err != nil {
+ t.Fatalf("Listen(%d): %s:", backlog, err)
}
- // The first nonSynCookieAccepts packets sent will trigger a gorooutine
- // based accept. The rest will trigger a cookie based accept.
- for i := 0; i < totalAccepts; i++ {
+ for i := 0; i < clientConnects; i++ {
// Send a SYN requests.
iss := seqnum.Value(i)
srcPort := context.TestPort + uint16(i)
@@ -3087,11 +3081,9 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) {
c := context.New(t, mtu)
defer c.Cleanup()
- // Set the SynRcvd threshold to zero to force a syn cookie based accept
- // to happen.
- opt := tcpip.TCPSynRcvdCountThresholdOption(0)
+ opt := tcpip.TCPAlwaysUseSynCookies(true)
if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
- t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
+ t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
}
// Create EP and start listening.
@@ -5363,7 +5355,7 @@ func TestListenBacklogFull(t *testing.T) {
}
lastPortOffset := uint16(0)
- for ; int(lastPortOffset) < listenBacklog; lastPortOffset++ {
+ for ; int(lastPortOffset) < listenBacklog+1; lastPortOffset++ {
executeHandshake(t, c, context.TestPort+lastPortOffset, false /*synCookieInUse */)
}
@@ -5671,15 +5663,13 @@ func TestListenSynRcvdQueueFull(t *testing.T) {
}
// Test acceptance.
- // Start listening.
- listenBacklog := 1
- if err := c.EP.Listen(listenBacklog); err != nil {
+ if err := c.EP.Listen(0); err != nil {
t.Fatalf("Listen failed: %s", err)
}
// Send two SYN's the first one should get a SYN-ACK, the
// second one should not get any response and is dropped as
- // the synRcvd count will be equal to backlog.
+ // the accept queue is full.
irs := seqnum.Value(context.TestInitialSequenceNumber)
c.SendPacket(nil, &context.Headers{
SrcPort: context.TestPort,
@@ -5701,23 +5691,7 @@ func TestListenSynRcvdQueueFull(t *testing.T) {
}
checker.IPv4(t, b, checker.TCP(tcpCheckers...))
- // Now execute send one more SYN. The stack should not respond as the backlog
- // is full at this point.
- //
- // NOTE: we did not complete the handshake for the previous one so the
- // accept backlog should be empty and there should be one connection in
- // synRcvd state.
- c.SendPacket(nil, &context.Headers{
- SrcPort: context.TestPort + 1,
- DstPort: context.StackPort,
- Flags: header.TCPFlagSyn,
- SeqNum: seqnum.Value(889),
- RcvWnd: 30000,
- })
- c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond)
-
- // Now complete the previous connection and verify that there is a connection
- // to accept.
+ // Now complete the previous connection.
// Send ACK.
c.SendPacket(nil, &context.Headers{
SrcPort: context.TestPort,
@@ -5728,11 +5702,24 @@ func TestListenSynRcvdQueueFull(t *testing.T) {
RcvWnd: 30000,
})
- // Try to accept the connections in the backlog.
+ // Verify if that is delivered to the accept queue.
we, ch := waiter.NewChannelEntry(nil)
c.WQ.EventRegister(&we, waiter.ReadableEvents)
defer c.WQ.EventUnregister(&we)
+ <-ch
+
+ // Now execute send one more SYN. The stack should not respond as the backlog
+ // is full at this point.
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort + 1,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagSyn,
+ SeqNum: seqnum.Value(889),
+ RcvWnd: 30000,
+ })
+ c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond)
+ // Try to accept the connections in the backlog.
newEP, _, err := c.EP.Accept(nil)
if _, ok := err.(*tcpip.ErrWouldBlock); ok {
// Wait for connection to be established.
@@ -5764,11 +5751,6 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
c := context.New(t, defaultMTU)
defer c.Cleanup()
- opt := tcpip.TCPSynRcvdCountThresholdOption(1)
- if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
- t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
- }
-
// Create TCP endpoint.
var err tcpip.Error
c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
@@ -5781,9 +5763,8 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
t.Fatalf("Bind failed: %s", err)
}
- // Start listening.
- listenBacklog := 1
- if err := c.EP.Listen(listenBacklog); err != nil {
+ // Test for SynCookies usage after filling up the backlog.
+ if err := c.EP.Listen(0); err != nil {
t.Fatalf("Listen failed: %s", err)
}
@@ -6066,7 +6047,7 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) {
if err := c.EP.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != nil {
t.Fatalf("Bind failed: %s", err)
}
- if err := c.EP.Listen(1); err != nil {
+ if err := c.EP.Listen(0); err != nil {
t.Fatalf("Listen failed: %s", err)
}
diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
index 2949588ce..1deb1fe4d 100644
--- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
@@ -139,9 +139,9 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS
defer c.Cleanup()
if cookieEnabled {
- var opt tcpip.TCPSynRcvdCountThresholdOption
+ opt := tcpip.TCPAlwaysUseSynCookies(true)
if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
- t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
+ t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
}
}
@@ -202,9 +202,9 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd
defer c.Cleanup()
if cookieEnabled {
- var opt tcpip.TCPSynRcvdCountThresholdOption
+ opt := tcpip.TCPAlwaysUseSynCookies(true)
if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
- t.Fatalf("SetTransportProtocolOption(%d, &%T(%d)): %s", tcp.ProtocolNumber, opt, opt, err)
+ t.Fatalf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
}
}
diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl
index 34e83ec49..634c15727 100644
--- a/test/packetimpact/runner/defs.bzl
+++ b/test/packetimpact/runner/defs.bzl
@@ -246,6 +246,12 @@ ALL_TESTS = [
expect_netstack_failure = True,
),
PacketimpactTestInfo(
+ name = "tcp_listen_backlog",
+ ),
+ PacketimpactTestInfo(
+ name = "tcp_syncookie",
+ ),
+ PacketimpactTestInfo(
name = "icmpv6_param_problem",
),
PacketimpactTestInfo(
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 92103c1e9..83ff70951 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -385,6 +385,26 @@ packetimpact_testbench(
],
)
+packetimpact_testbench(
+ name = "tcp_listen_backlog",
+ srcs = ["tcp_listen_backlog_test.go"],
+ deps = [
+ "//pkg/tcpip/header",
+ "//test/packetimpact/testbench",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+packetimpact_testbench(
+ name = "tcp_syncookie",
+ srcs = ["tcp_syncookie_test.go"],
+ deps = [
+ "//pkg/tcpip/header",
+ "//test/packetimpact/testbench",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
validate_all_tests()
[packetimpact_go_test(
diff --git a/test/packetimpact/tests/tcp_listen_backlog_test.go b/test/packetimpact/tests/tcp_listen_backlog_test.go
new file mode 100644
index 000000000..26c812d0a
--- /dev/null
+++ b/test/packetimpact/tests/tcp_listen_backlog_test.go
@@ -0,0 +1,86 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_listen_backlog_test
+
+import (
+ "flag"
+ "testing"
+ "time"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+ testbench.Initialize(flag.CommandLine)
+}
+
+// TestTCPListenBacklog tests for a listening endpoint behavior:
+// (1) reply to more SYNs than what is configured as listen backlog
+// (2) ignore ACKs (that complete a handshake) when the accept queue is full
+// (3) ignore incoming SYNs when the accept queue is full
+func TestTCPListenBacklog(t *testing.T) {
+ dut := testbench.NewDUT(t)
+
+ // Listening endpoint accepts one more connection than the listen backlog.
+ listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 0 /*backlog*/)
+
+ var establishedConn testbench.TCPIPv4
+ var incompleteConn testbench.TCPIPv4
+
+ // Test if the DUT listener replies to more SYNs than listen backlog+1
+ for i, conn := range []*testbench.TCPIPv4{&establishedConn, &incompleteConn} {
+ *conn = dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+ // Expect dut connection to have transitioned to SYN-RCVD state.
+ conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)})
+ if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil {
+ t.Fatalf("expected SYN-ACK for %d connection, %s", i, err)
+ }
+ }
+ defer establishedConn.Close(t)
+ defer incompleteConn.Close(t)
+
+ // Send the ACK to complete handshake.
+ establishedConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)})
+ dut.PollOne(t, listenFd, unix.POLLIN, time.Second)
+
+ // Send the ACK to complete handshake, expect this to be ignored by the
+ // listener.
+ incompleteConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)})
+
+ // Drain the accept queue to enable poll for subsequent connections on the
+ // listener.
+ dut.Accept(t, listenFd)
+
+ // The ACK for the incomplete connection should be ignored by the
+ // listening endpoint and the poll on listener should now time out.
+ if pfds := dut.Poll(t, []unix.PollFd{{Fd: listenFd, Events: unix.POLLIN}}, time.Second); len(pfds) != 0 {
+ t.Fatalf("got dut.Poll(...) = %#v", pfds)
+ }
+
+ // Re-send the ACK to complete handshake and re-fill the accept-queue.
+ incompleteConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagAck)})
+ dut.PollOne(t, listenFd, unix.POLLIN, time.Second)
+
+ // Now initiate a new connection when the accept queue is full.
+ connectingConn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+ defer connectingConn.Close(t)
+ // Expect dut connection to drop the SYN and let the client stay in SYN_SENT state.
+ connectingConn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)})
+ if got, err := connectingConn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err == nil {
+ t.Fatalf("expected no SYN-ACK, but got %s", got)
+ }
+}
diff --git a/test/packetimpact/tests/tcp_syncookie_test.go b/test/packetimpact/tests/tcp_syncookie_test.go
new file mode 100644
index 000000000..1c21c62ff
--- /dev/null
+++ b/test/packetimpact/tests/tcp_syncookie_test.go
@@ -0,0 +1,70 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_syncookie_test
+
+import (
+ "flag"
+ "testing"
+ "time"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+ testbench.Initialize(flag.CommandLine)
+}
+
+// TestSynCookie test if the DUT listener is replying back using syn cookies.
+// The test does not complete the handshake by not sending the ACK to SYNACK.
+// When syncookies are not used, this forces the listener to retransmit SYNACK.
+// And when syncookies are being used, there is no such retransmit.
+func TestTCPSynCookie(t *testing.T) {
+ dut := testbench.NewDUT(t)
+
+ // Listening endpoint accepts one more connection than the listen backlog.
+ _, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1 /*backlog*/)
+
+ var withoutSynCookieConn testbench.TCPIPv4
+ var withSynCookieConn testbench.TCPIPv4
+
+ // Test if the DUT listener replies to more SYNs than listen backlog+1
+ for _, conn := range []*testbench.TCPIPv4{&withoutSynCookieConn, &withSynCookieConn} {
+ *conn = dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+ }
+ defer withoutSynCookieConn.Close(t)
+ defer withSynCookieConn.Close(t)
+
+ checkSynAck := func(t *testing.T, conn *testbench.TCPIPv4, expectRetransmit bool) {
+ // Expect dut connection to have transitioned to SYN-RCVD state.
+ conn.Send(t, testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn)})
+ if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil {
+ t.Fatalf("expected SYN-ACK, but got %s", err)
+ }
+
+ // If the DUT listener is using syn cookies, it will not retransmit SYNACK
+ got, err := conn.ExpectData(t, &testbench.TCP{SeqNum: testbench.Uint32(uint32(*conn.RemoteSeqNum(t) - 1)), Flags: testbench.TCPFlags(header.TCPFlagSyn | header.TCPFlagAck)}, nil, 2*time.Second)
+ if expectRetransmit && err != nil {
+ t.Fatalf("expected retransmitted SYN-ACK, but got %s", err)
+ }
+ if !expectRetransmit && err == nil {
+ t.Fatalf("expected no retransmitted SYN-ACK, but got %s", got)
+ }
+ }
+
+ t.Run("without syncookies", func(t *testing.T) { checkSynAck(t, &withoutSynCookieConn, true /*expectRetransmit*/) })
+ t.Run("with syncookies", func(t *testing.T) { checkSynAck(t, &withSynCookieConn, false /*expectRetransmit*/) })
+}
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 597b5bcb1..d391363fb 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -489,13 +489,6 @@ void TestListenWhileConnect(const TestParam& param,
TestAddress const& listener = param.listener;
TestAddress const& connector = param.connector;
- constexpr int kBacklog = 2;
- // Linux completes one more connection than the listen backlog argument.
- // To ensure that there is at least one client connection that stays in
- // connecting state, keep 2 more client connections than the listen backlog.
- // gVisor differs in this behavior though, gvisor.dev/issue/3153.
- constexpr int kClients = kBacklog + 2;
-
// Create the listening socket.
FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
@@ -503,6 +496,13 @@ void TestListenWhileConnect(const TestParam& param,
ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
listener.addr_len),
SyscallSucceeds());
+ // This test is only interested in deterministically getting a socket in
+ // connecting state. For that, we use a listen backlog of zero which would
+ // mean there is exactly one connection that gets established and is enqueued
+ // to the accept queue. We poll on the listener to ensure that is enqueued.
+ // After that the subsequent client connect will stay in connecting state as
+ // the accept queue is full.
+ constexpr int kBacklog = 0;
ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
// Get the port bound by the listening socket.
@@ -515,42 +515,49 @@ void TestListenWhileConnect(const TestParam& param,
sockaddr_storage conn_addr = connector.addr;
ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
- std::vector<FileDescriptor> clients;
- for (int i = 0; i < kClients; i++) {
- FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE(
- Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
- int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
- connector.addr_len);
- if (ret != 0) {
- EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
- clients.push_back(std::move(client));
- }
+ FileDescriptor established_client = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+ ASSERT_THAT(
+ connect(established_client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+
+ // Ensure that the accept queue has the completed connection.
+ constexpr int kTimeout = 10000;
+ pollfd pfd = {
+ .fd = listen_fd.get(),
+ .events = POLLIN,
+ };
+ ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+ ASSERT_EQ(pfd.revents, POLLIN);
+
+ FileDescriptor connecting_client = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+ // Keep the last client in connecting state.
+ int ret =
+ connect(connecting_client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len);
+ if (ret != 0) {
+ EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
}
stopListen(listen_fd);
- for (auto& client : clients) {
- constexpr int kTimeout = 10000;
+ std::array<std::pair<int, int>, 2> sockets = {
+ std::make_pair(established_client.get(), ECONNRESET),
+ std::make_pair(connecting_client.get(), ECONNREFUSED),
+ };
+ for (size_t i = 0; i < sockets.size(); i++) {
+ SCOPED_TRACE(absl::StrCat("i=", i));
+ auto [fd, expected_errno] = sockets[i];
pollfd pfd = {
- .fd = client.get(),
- .events = POLLIN,
+ .fd = fd,
};
- // When the listening socket is closed, then we expect the remote to reset
- // the connection.
- ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
- ASSERT_EQ(pfd.revents, POLLIN | POLLHUP | POLLERR);
+ // When the listening socket is closed, the peer would reset the connection.
+ EXPECT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+ EXPECT_EQ(pfd.revents, POLLHUP | POLLERR);
char c;
- // Subsequent read can fail with:
- // ECONNRESET: If the client connection was established and was reset by the
- // remote.
- // ECONNREFUSED: If the client connection failed to be established.
- ASSERT_THAT(read(client.get(), &c, sizeof(c)),
- AnyOf(SyscallFailsWithErrno(ECONNRESET),
- SyscallFailsWithErrno(ECONNREFUSED)));
- // The last client connection would be in connecting (SYN_SENT) state.
- if (client.get() == clients[kClients - 1].get()) {
- ASSERT_EQ(errno, ECONNREFUSED) << strerror(errno);
- }
+ EXPECT_THAT(read(fd, &c, sizeof(c)), SyscallFailsWithErrno(expected_errno));
}
}
@@ -570,7 +577,59 @@ TEST_P(SocketInetLoopbackTest, TCPListenShutdownWhileConnect) {
// random save as established connections which can't be delivered to the accept
// queue because the queue is full are not correctly delivered after restore
// causing the last accept to timeout on the restore.
-TEST_P(SocketInetLoopbackTest, TCPbacklog_NoRandomSave) {
+TEST_P(SocketInetLoopbackTest, TCPAcceptBacklogSizes_NoRandomSave) {
+ auto const& param = GetParam();
+
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ // Create the listening socket.
+ const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+ std::array<int, 3> backlogs = {-1, 0, 1};
+ for (auto& backlog : backlogs) {
+ ASSERT_THAT(listen(listen_fd.get(), backlog), SyscallSucceeds());
+
+ int expected_accepts;
+ if (backlog < 0) {
+ expected_accepts = 1024;
+ } else {
+ expected_accepts = backlog + 1;
+ }
+ for (int i = 0; i < expected_accepts; i++) {
+ SCOPED_TRACE(absl::StrCat("i=", i));
+ // Connect to the listening socket.
+ const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ ASSERT_THAT(
+ RetryEINTR(connect)(conn_fd.get(),
+ reinterpret_cast<struct sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+ const FileDescriptor accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+ }
+ }
+}
+
+// TODO(b/157236388): Remove _NoRandomSave once bug is fixed. Test fails w/
+// random save as established connections which can't be delivered to the accept
+// queue because the queue is full are not correctly delivered after restore
+// causing the last accept to timeout on the restore.
+TEST_P(SocketInetLoopbackTest, TCPBacklog_NoRandomSave) {
auto const& param = GetParam();
TestAddress const& listener = param.listener;
@@ -595,6 +654,7 @@ TEST_P(SocketInetLoopbackTest, TCPbacklog_NoRandomSave) {
ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
int i = 0;
while (1) {
+ SCOPED_TRACE(absl::StrCat("i=", i));
int ret;
// Connect to the listening socket.
@@ -620,103 +680,133 @@ TEST_P(SocketInetLoopbackTest, TCPbacklog_NoRandomSave) {
i++;
}
+ int client_conns = i;
+ int accepted_conns = 0;
for (; i != 0; i--) {
- // Accept the connection.
- //
- // We have to assign a name to the accepted socket, as unamed temporary
- // objects are destructed upon full evaluation of the expression it is in,
- // potentially causing the connecting socket to fail to shutdown properly.
- auto accepted =
- ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+ SCOPED_TRACE(absl::StrCat("i=", i));
+ pollfd pfd = {
+ .fd = listen_fd.get(),
+ .events = POLLIN,
+ };
+ // Look for incoming connections to accept. The last connect request could
+ // be established from the client side, but the ACK of the handshake could
+ // be dropped by the listener if the accept queue was filled up by the
+ // previous connect.
+ int ret;
+ ASSERT_THAT(ret = poll(&pfd, 1, 3000), SyscallSucceeds());
+ if (ret == 0) break;
+ if (pfd.revents == POLLIN) {
+ // Accept the connection.
+ //
+ // We have to assign a name to the accepted socket, as unamed temporary
+ // objects are destructed upon full evaluation of the expression it is in,
+ // potentially causing the connecting socket to fail to shutdown properly.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+ accepted_conns++;
+ }
}
+ // We should accept at least listen backlog + 1 connections. As the stack is
+ // enqueuing established connections to the accept queue, newer SYNs could
+ // still be replied to causing those client connections would be accepted as
+ // we start dequeuing the queue.
+ ASSERT_GE(accepted_conns, kBacklogSize + 1);
+ ASSERT_GE(client_conns, accepted_conns);
}
-// Test if the stack completes atmost listen backlog number of client
-// connections. It exercises the path of the stack that enqueues completed
-// connections to accept queue vs new incoming SYNs.
-TEST_P(SocketInetLoopbackTest, TCPConnectBacklog_NoRandomSave) {
- const auto& param = GetParam();
- const TestAddress& listener = param.listener;
- const TestAddress& connector = param.connector;
+// TODO(b/157236388): Remove _NoRandomSave once bug is fixed. Test fails w/
+// random save as established connections which can't be delivered to the accept
+// queue because the queue is full are not correctly delivered after restore
+// causing the last accept to timeout on the restore.
+TEST_P(SocketInetLoopbackTest, TCPBacklogAcceptAll_NoRandomSave) {
+ auto const& param = GetParam();
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+ // Create the listening socket.
+ FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
constexpr int kBacklog = 1;
- // Keep the number of client connections more than the listen backlog.
- // Linux completes one more connection than the listen backlog argument.
- // gVisor differs in this behavior though, gvisor.dev/issue/3153.
- int kClients = kBacklog + 2;
- if (IsRunningOnGvisor()) {
- kClients--;
- }
+ ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
- // Run the following test for few iterations to test race between accept queue
- // getting filled with incoming SYNs.
- for (int num = 0; num < 10; num++) {
- FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
- Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
- sockaddr_storage listen_addr = listener.addr;
- ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
- listener.addr_len),
- SyscallSucceeds());
- ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
- socklen_t addrlen = listener.addr_len;
- ASSERT_THAT(
- getsockname(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
- &addrlen),
- SyscallSucceeds());
- uint16_t const port =
- ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
- sockaddr_storage conn_addr = connector.addr;
- ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
- std::vector<FileDescriptor> clients;
- // Issue multiple non-blocking client connects.
- for (int i = 0; i < kClients; i++) {
- FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE(
- Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
- int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
- connector.addr_len);
- if (ret != 0) {
- EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
- }
- clients.push_back(std::move(client));
+ // Fill up the accept queue and trigger more client connections which would be
+ // waiting to be accepted.
+ std::array<FileDescriptor, kBacklog + 1> established_clients;
+ for (auto& fd : established_clients) {
+ fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+ ASSERT_THAT(connect(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+ }
+ std::array<FileDescriptor, kBacklog> waiting_clients;
+ for (auto& fd : waiting_clients) {
+ fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+ int ret = connect(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len);
+ if (ret != 0) {
+ EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
}
+ }
- // Now that client connects are issued, wait for the accept queue to get
- // filled and ensure no new client connection is completed.
- for (int i = 0; i < kClients; i++) {
- pollfd pfd = {
- .fd = clients[i].get(),
- .events = POLLOUT,
- };
- if (i < kClients - 1) {
- // Poll for client side connection completions with a large timeout.
- // We cannot poll on the listener side without calling accept as poll
- // stays level triggered with non-zero accept queue length.
- //
- // Client side poll would not guarantee that the completed connection
- // has been enqueued in to the acccept queue, but the fact that the
- // listener ACKd the SYN, means that it cannot complete any new incoming
- // SYNs when it has already ACKd for > backlog number of SYNs.
- ASSERT_THAT(poll(&pfd, 1, 10000), SyscallSucceedsWithValue(1))
- << "num=" << num << " i=" << i << " kClients=" << kClients;
- ASSERT_EQ(pfd.revents, POLLOUT) << "num=" << num << " i=" << i;
- } else {
- // Now that we expect accept queue filled up, ensure that the last
- // client connection never completes with a smaller poll timeout.
- ASSERT_THAT(poll(&pfd, 1, 1000), SyscallSucceedsWithValue(0))
- << "num=" << num << " i=" << i;
- }
+ auto accept_connection = [&]() {
+ constexpr int kTimeout = 10000;
+ pollfd pfd = {
+ .fd = listen_fd.get(),
+ .events = POLLIN,
+ };
+ ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+ ASSERT_EQ(pfd.revents, POLLIN);
+ // Accept the connection.
+ //
+ // We have to assign a name to the accepted socket, as unamed temporary
+ // objects are destructed upon full evaluation of the expression it is in,
+ // potentially causing the connecting socket to fail to shutdown properly.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+ };
- ASSERT_THAT(close(clients[i].release()), SyscallSucceedsWithValue(0))
- << "num=" << num << " i=" << i;
- }
- clients.clear();
- // We close the listening side and open a new listener. We could instead
- // drain the accept queue by calling accept() and reuse the listener, but
- // that is racy as the retransmitted SYNs could get ACKd as we make room in
- // the accept queue.
- ASSERT_THAT(close(listen_fd.release()), SyscallSucceedsWithValue(0));
+ // Ensure that we accept all client connections. The waiting connections would
+ // get enqueued as we drain the accept queue.
+ for (int i = 0; i < std::size(established_clients); i++) {
+ SCOPED_TRACE(absl::StrCat("established clients i=", i));
+ accept_connection();
+ }
+
+ // The waiting client connections could be in one of these 2 states:
+ // (1) SYN_SENT: if the SYN was dropped because accept queue was full
+ // (2) ESTABLISHED: if the listener sent back a SYNACK, but may have dropped
+ // the ACK from the client if the accept queue was full (send out a data to
+ // re-send that ACK, to address that case).
+ for (int i = 0; i < std::size(waiting_clients); i++) {
+ SCOPED_TRACE(absl::StrCat("waiting clients i=", i));
+ constexpr int kTimeout = 10000;
+ pollfd pfd = {
+ .fd = waiting_clients[i].get(),
+ .events = POLLOUT,
+ };
+ EXPECT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+ EXPECT_EQ(pfd.revents, POLLOUT);
+ char c;
+ EXPECT_THAT(RetryEINTR(send)(waiting_clients[i].get(), &c, sizeof(c), 0),
+ SyscallSucceedsWithValue(sizeof(c)));
+ accept_connection();
}
}