diff options
author | Bhasker Hariharan <bhaskerh@google.com> | 2021-02-02 11:03:37 -0800 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2021-02-02 11:05:28 -0800 |
commit | 8c7c5abafbd8a72a43105cc352b42e48c12a99e8 (patch) | |
tree | 99949c5fb992f8af16e686241840ed13683df5a8 | |
parent | 3817c7349de2dde950fd65dcab1f4859c095eeaf (diff) |
Add support for rate limiting out of window ACKs.
Netstack today will send dupACK's with no rate limit for incoming out of
window segments. This can result in ACK loops for example if a TCP socket
connects to itself (actually permitted by TCP). Where the ACK sent in
response to packets being out of order itself gets considered as an out
of window segment resulting in another ACK being generated.
PiperOrigin-RevId: 355206877
-rw-r--r-- | pkg/tcpip/stack/stack.go | 13 | ||||
-rw-r--r-- | pkg/tcpip/stack/stack_options.go | 25 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/connect.go | 7 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/endpoint.go | 20 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/endpoint_state.go | 10 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/rcv.go | 4 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/snd.go | 10 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/tcp_test.go | 7 | ||||
-rw-r--r-- | test/packetimpact/tests/tcp_outside_the_window_test.go | 18 | ||||
-rw-r--r-- | test/syscalls/linux/tcp_socket.cc | 36 |
10 files changed, 145 insertions, 5 deletions
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 57ad412a1..a51d758d0 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -458,6 +458,18 @@ type Stack struct { // receiveBufferSize holds the min/default/max receive buffer sizes for // endpoints other than TCP. receiveBufferSize ReceiveBufferSizeOption + + // tcpInvalidRateLimit is the maximal rate for sending duplicate + // acknowledgements in response to incoming TCP packets that are for an existing + // connection but that are invalid due to any of the following reasons: + // + // a) out-of-window sequence number. + // b) out-of-window acknowledgement number. + // c) PAWS check failure (when implemented). + // + // This is required to prevent potential ACK loops. + // Setting this to 0 will disable all rate limiting. + tcpInvalidRateLimit time.Duration } // UniqueID is an abstract generator of unique identifiers. @@ -668,6 +680,7 @@ func New(opts Options) *Stack { Default: DefaultBufferSize, Max: DefaultMaxBufferSize, }, + tcpInvalidRateLimit: defaultTCPInvalidRateLimit, } // Add specified network protocols. diff --git a/pkg/tcpip/stack/stack_options.go b/pkg/tcpip/stack/stack_options.go index 8d9b20b7e..3066f4ffd 100644 --- a/pkg/tcpip/stack/stack_options.go +++ b/pkg/tcpip/stack/stack_options.go @@ -15,6 +15,8 @@ package stack import ( + "time" + "gvisor.dev/gvisor/pkg/tcpip" ) @@ -29,6 +31,10 @@ const ( // DefaultMaxBufferSize is the default maximum permitted size of a // send/receive buffer. DefaultMaxBufferSize = 4 << 20 // 4 MiB + + // defaultTCPInvalidRateLimit is the default value for + // stack.TCPInvalidRateLimit. + defaultTCPInvalidRateLimit = 500 * time.Millisecond ) // ReceiveBufferSizeOption is used by stack.(Stack*).Option/SetOption to @@ -39,6 +45,10 @@ type ReceiveBufferSizeOption struct { Max int } +// TCPInvalidRateLimitOption is used by stack.(Stack*).Option/SetOption to get/set +// stack.tcpInvalidRateLimit. +type TCPInvalidRateLimitOption time.Duration + // SetOption allows setting stack wide options. func (s *Stack) SetOption(option interface{}) tcpip.Error { switch v := option.(type) { @@ -74,6 +84,15 @@ func (s *Stack) SetOption(option interface{}) tcpip.Error { s.mu.Unlock() return nil + case TCPInvalidRateLimitOption: + if v < 0 { + return &tcpip.ErrInvalidOptionValue{} + } + s.mu.Lock() + s.tcpInvalidRateLimit = time.Duration(v) + s.mu.Unlock() + return nil + default: return &tcpip.ErrUnknownProtocolOption{} } @@ -94,6 +113,12 @@ func (s *Stack) Option(option interface{}) tcpip.Error { s.mu.RUnlock() return nil + case *TCPInvalidRateLimitOption: + s.mu.RLock() + *v = TCPInvalidRateLimitOption(s.tcpInvalidRateLimit) + s.mu.RUnlock() + return nil + default: return &tcpip.ErrUnknownProtocolOption{} } diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 4695b66d6..34a631b53 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -333,7 +333,9 @@ func (h *handshake) synRcvdState(s *segment) tcpip.Error { // number and "After sending the acknowledgment, drop the unacceptable // segment and return." if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) { - h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd) + if h.ep.allowOutOfWindowAck() { + h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd) + } return nil } @@ -1185,8 +1187,7 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err tcpip.Error) { // endpoint MUST terminate its connection. The local TCP endpoint // should then rely on SYN retransmission from the remote end to // re-establish the connection. - - e.snd.sendAck() + e.snd.maybeSendOutOfWindowAck(s) } else if s.flagIsSet(header.TCPFlagAck) { // Patch the window size in the segment according to the // send window scale. diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index e645aa194..4e5a6089f 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -688,6 +688,10 @@ type endpoint struct { // ops is used to get socket level options. ops tcpip.SocketOptions + + // lastOutOfWindowAckTime is the time at which the an ACK was sent in response + // to an out of window segment being received by this endpoint. + lastOutOfWindowAckTime time.Time `state:".(unixTime)"` } // UniqueID implements stack.TransportEndpoint.UniqueID. @@ -3125,3 +3129,19 @@ func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption { Max: ss.Max, } } + +// allowOutOfWindowAck returns true if an out-of-window ACK can be sent now. +func (e *endpoint) allowOutOfWindowAck() bool { + var limit stack.TCPInvalidRateLimitOption + if err := e.stack.Option(&limit); err != nil { + panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err)) + } + + now := time.Now() + if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) { + return false + } + + e.lastOutOfWindowAckTime = now + return true +} diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index c21dbc682..e4368026f 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -308,6 +308,16 @@ func (e *endpoint) loadRecentTSTime(unix unixTime) { e.recentTSTime = time.Unix(unix.second, unix.nano) } +// saveLastOutOfWindowAckTime is invoked by stateify. +func (e *endpoint) saveLastOutOfWindowAckTime() unixTime { + return unixTime{e.lastOutOfWindowAckTime.Unix(), e.lastOutOfWindowAckTime.UnixNano()} +} + +// loadLastOutOfWindowAckTime is invoked by stateify. +func (e *endpoint) loadLastOutOfWindowAckTime(unix unixTime) { + e.lastOutOfWindowAckTime = time.Unix(unix.second, unix.nano) +} + // saveMeasureTime is invoked by stateify. func (r *rcvBufAutoTuneParams) saveMeasureTime() unixTime { return unixTime{r.measureTime.Unix(), r.measureTime.UnixNano()} diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index 7a7c402c4..a5c82b8fa 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -385,7 +385,7 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo // fails, we ignore the packet: // https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L5591 if r.ep.snd.sndNxt.LessThan(s.ackNumber) { - r.ep.snd.sendAck() + r.ep.snd.maybeSendOutOfWindowAck(s) return true, nil } @@ -454,7 +454,7 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err tcpip.Error) { // send an ACK and stop further processing of the segment. // This is according to RFC 793, page 68. if !r.acceptable(segSeq, segLen) { - r.ep.snd.sendAck() + r.ep.snd.maybeSendOutOfWindowAck(s) return true, nil } diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index dfc8fd248..538edd6cf 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -1548,3 +1548,13 @@ func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags byte, seq return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd) } + +// maybeSendOutOfWindowAck sends an ACK if we are not being rate limited +// currently. +func (s *sender) maybeSendOutOfWindowAck(seg *segment) { + // Data packets are unlikely to be part of an ACK loop. So always send + // an ACK for a packet w/ data. + if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() { + s.sendAck() + } +} diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index da2730e27..cd3c4a027 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -6302,6 +6302,13 @@ func TestReceiveBufferAutoTuning(t *testing.T) { // Enable Auto-tuning. stk := c.Stack() + // Disable out of window rate limiting for this test by setting it to 0 as we + // use out of window ACKs to measure the advertised window. + var tcpInvalidRateLimit stack.TCPInvalidRateLimitOption + if err := stk.SetOption(tcpInvalidRateLimit); err != nil { + t.Fatalf("e.stack.SetOption(%#v) = %s", tcpInvalidRateLimit, err) + } + const receiveBufferSize = 80 << 10 // 80KB. const maxReceiveBufferSize = receiveBufferSize * 10 { diff --git a/test/packetimpact/tests/tcp_outside_the_window_test.go b/test/packetimpact/tests/tcp_outside_the_window_test.go index 1b041932a..8909a348e 100644 --- a/test/packetimpact/tests/tcp_outside_the_window_test.go +++ b/test/packetimpact/tests/tcp_outside_the_window_test.go @@ -84,6 +84,24 @@ func TestTCPOutsideTheWindow(t *testing.T) { if tt.expectACK && err != nil { t.Fatalf("expected an ACK packet within %s but got none: %s", timeout, err) } + // Data packets w/o SYN bits are always acked by Linux. Netstack ACK's data packets + // always right now. So only send a second segment and test for no ACK for packets + // with no data. + if tt.expectACK && tt.payload == nil { + // Sending another out-of-window segment immediately should not trigger + // an ACK if less than 500ms(default rate limit for out-of-window ACKs) + // has passed since the last ACK was sent. + t.Logf("sending another segment") + conn.Send(t, testbench.TCP{ + Flags: testbench.Uint8(tt.tcpFlags), + SeqNum: testbench.Uint32(uint32(conn.LocalSeqNum(t).Add(windowSize))), + }, tt.payload...) + timeout := 3 * time.Second + gotACK, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: localSeqNum}, timeout) + if err == nil { + t.Fatalf("expected no ACK packet but got one: %s", gotACK) + } + } if !tt.expectACK && gotACK != nil { t.Fatalf("expected no ACK packet within %s but got one: %s", timeout, gotACK) } diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index 9028ab024..f56c50e61 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -1168,6 +1168,42 @@ TEST_P(SimpleTcpSocketTest, SelfConnectSendRecv_NoRandomSave) { EXPECT_EQ(read_bytes, kBufSz); } +TEST_P(SimpleTcpSocketTest, SelfConnectSend_NoRandomSave) { + // Initialize address to the loopback one. + sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + const FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + + constexpr int max_seg = 256; + ASSERT_THAT( + setsockopt(s.get(), SOL_TCP, TCP_MAXSEG, &max_seg, sizeof(max_seg)), + SyscallSucceeds()); + + ASSERT_THAT(bind(s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallSucceeds()); + // Get the bound port. + ASSERT_THAT( + getsockname(s.get(), reinterpret_cast<struct sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + ASSERT_THAT(RetryEINTR(connect)( + s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallSucceeds()); + + std::vector<char> writebuf(512 << 10); // 512 KiB. + + // Try to send the whole thing. + int n; + ASSERT_THAT(n = SendFd(s.get(), writebuf.data(), writebuf.size(), 0), + SyscallSucceeds()); + + // We should have written the whole thing. + EXPECT_EQ(n, writebuf.size()); + EXPECT_THAT(shutdown(s.get(), SHUT_WR), SyscallSucceedsWithValue(0)); +} + TEST_P(SimpleTcpSocketTest, NonBlockingConnect) { const FileDescriptor listener = ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); |