From 8c7c5abafbd8a72a43105cc352b42e48c12a99e8 Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Tue, 2 Feb 2021 11:03:37 -0800 Subject: Add support for rate limiting out of window ACKs. Netstack today will send dupACK's with no rate limit for incoming out of window segments. This can result in ACK loops for example if a TCP socket connects to itself (actually permitted by TCP). Where the ACK sent in response to packets being out of order itself gets considered as an out of window segment resulting in another ACK being generated. PiperOrigin-RevId: 355206877 --- pkg/tcpip/stack/stack.go | 13 +++++++++++++ pkg/tcpip/stack/stack_options.go | 25 +++++++++++++++++++++++++ pkg/tcpip/transport/tcp/connect.go | 7 ++++--- pkg/tcpip/transport/tcp/endpoint.go | 20 ++++++++++++++++++++ pkg/tcpip/transport/tcp/endpoint_state.go | 10 ++++++++++ pkg/tcpip/transport/tcp/rcv.go | 4 ++-- pkg/tcpip/transport/tcp/snd.go | 10 ++++++++++ pkg/tcpip/transport/tcp/tcp_test.go | 7 +++++++ 8 files changed, 91 insertions(+), 5 deletions(-) (limited to 'pkg') diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 57ad412a1..a51d758d0 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -458,6 +458,18 @@ type Stack struct { // receiveBufferSize holds the min/default/max receive buffer sizes for // endpoints other than TCP. receiveBufferSize ReceiveBufferSizeOption + + // tcpInvalidRateLimit is the maximal rate for sending duplicate + // acknowledgements in response to incoming TCP packets that are for an existing + // connection but that are invalid due to any of the following reasons: + // + // a) out-of-window sequence number. + // b) out-of-window acknowledgement number. + // c) PAWS check failure (when implemented). + // + // This is required to prevent potential ACK loops. + // Setting this to 0 will disable all rate limiting. + tcpInvalidRateLimit time.Duration } // UniqueID is an abstract generator of unique identifiers. @@ -668,6 +680,7 @@ func New(opts Options) *Stack { Default: DefaultBufferSize, Max: DefaultMaxBufferSize, }, + tcpInvalidRateLimit: defaultTCPInvalidRateLimit, } // Add specified network protocols. diff --git a/pkg/tcpip/stack/stack_options.go b/pkg/tcpip/stack/stack_options.go index 8d9b20b7e..3066f4ffd 100644 --- a/pkg/tcpip/stack/stack_options.go +++ b/pkg/tcpip/stack/stack_options.go @@ -15,6 +15,8 @@ package stack import ( + "time" + "gvisor.dev/gvisor/pkg/tcpip" ) @@ -29,6 +31,10 @@ const ( // DefaultMaxBufferSize is the default maximum permitted size of a // send/receive buffer. DefaultMaxBufferSize = 4 << 20 // 4 MiB + + // defaultTCPInvalidRateLimit is the default value for + // stack.TCPInvalidRateLimit. + defaultTCPInvalidRateLimit = 500 * time.Millisecond ) // ReceiveBufferSizeOption is used by stack.(Stack*).Option/SetOption to @@ -39,6 +45,10 @@ type ReceiveBufferSizeOption struct { Max int } +// TCPInvalidRateLimitOption is used by stack.(Stack*).Option/SetOption to get/set +// stack.tcpInvalidRateLimit. +type TCPInvalidRateLimitOption time.Duration + // SetOption allows setting stack wide options. func (s *Stack) SetOption(option interface{}) tcpip.Error { switch v := option.(type) { @@ -74,6 +84,15 @@ func (s *Stack) SetOption(option interface{}) tcpip.Error { s.mu.Unlock() return nil + case TCPInvalidRateLimitOption: + if v < 0 { + return &tcpip.ErrInvalidOptionValue{} + } + s.mu.Lock() + s.tcpInvalidRateLimit = time.Duration(v) + s.mu.Unlock() + return nil + default: return &tcpip.ErrUnknownProtocolOption{} } @@ -94,6 +113,12 @@ func (s *Stack) Option(option interface{}) tcpip.Error { s.mu.RUnlock() return nil + case *TCPInvalidRateLimitOption: + s.mu.RLock() + *v = TCPInvalidRateLimitOption(s.tcpInvalidRateLimit) + s.mu.RUnlock() + return nil + default: return &tcpip.ErrUnknownProtocolOption{} } diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 4695b66d6..34a631b53 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -333,7 +333,9 @@ func (h *handshake) synRcvdState(s *segment) tcpip.Error { // number and "After sending the acknowledgment, drop the unacceptable // segment and return." if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) { - h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd) + if h.ep.allowOutOfWindowAck() { + h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd) + } return nil } @@ -1185,8 +1187,7 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err tcpip.Error) { // endpoint MUST terminate its connection. The local TCP endpoint // should then rely on SYN retransmission from the remote end to // re-establish the connection. - - e.snd.sendAck() + e.snd.maybeSendOutOfWindowAck(s) } else if s.flagIsSet(header.TCPFlagAck) { // Patch the window size in the segment according to the // send window scale. diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index e645aa194..4e5a6089f 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -688,6 +688,10 @@ type endpoint struct { // ops is used to get socket level options. ops tcpip.SocketOptions + + // lastOutOfWindowAckTime is the time at which the an ACK was sent in response + // to an out of window segment being received by this endpoint. + lastOutOfWindowAckTime time.Time `state:".(unixTime)"` } // UniqueID implements stack.TransportEndpoint.UniqueID. @@ -3125,3 +3129,19 @@ func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption { Max: ss.Max, } } + +// allowOutOfWindowAck returns true if an out-of-window ACK can be sent now. +func (e *endpoint) allowOutOfWindowAck() bool { + var limit stack.TCPInvalidRateLimitOption + if err := e.stack.Option(&limit); err != nil { + panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err)) + } + + now := time.Now() + if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) { + return false + } + + e.lastOutOfWindowAckTime = now + return true +} diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index c21dbc682..e4368026f 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -308,6 +308,16 @@ func (e *endpoint) loadRecentTSTime(unix unixTime) { e.recentTSTime = time.Unix(unix.second, unix.nano) } +// saveLastOutOfWindowAckTime is invoked by stateify. +func (e *endpoint) saveLastOutOfWindowAckTime() unixTime { + return unixTime{e.lastOutOfWindowAckTime.Unix(), e.lastOutOfWindowAckTime.UnixNano()} +} + +// loadLastOutOfWindowAckTime is invoked by stateify. +func (e *endpoint) loadLastOutOfWindowAckTime(unix unixTime) { + e.lastOutOfWindowAckTime = time.Unix(unix.second, unix.nano) +} + // saveMeasureTime is invoked by stateify. func (r *rcvBufAutoTuneParams) saveMeasureTime() unixTime { return unixTime{r.measureTime.Unix(), r.measureTime.UnixNano()} diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index 7a7c402c4..a5c82b8fa 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -385,7 +385,7 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo // fails, we ignore the packet: // https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L5591 if r.ep.snd.sndNxt.LessThan(s.ackNumber) { - r.ep.snd.sendAck() + r.ep.snd.maybeSendOutOfWindowAck(s) return true, nil } @@ -454,7 +454,7 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err tcpip.Error) { // send an ACK and stop further processing of the segment. // This is according to RFC 793, page 68. if !r.acceptable(segSeq, segLen) { - r.ep.snd.sendAck() + r.ep.snd.maybeSendOutOfWindowAck(s) return true, nil } diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index dfc8fd248..538edd6cf 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -1548,3 +1548,13 @@ func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags byte, seq return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd) } + +// maybeSendOutOfWindowAck sends an ACK if we are not being rate limited +// currently. +func (s *sender) maybeSendOutOfWindowAck(seg *segment) { + // Data packets are unlikely to be part of an ACK loop. So always send + // an ACK for a packet w/ data. + if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() { + s.sendAck() + } +} diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index da2730e27..cd3c4a027 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -6302,6 +6302,13 @@ func TestReceiveBufferAutoTuning(t *testing.T) { // Enable Auto-tuning. stk := c.Stack() + // Disable out of window rate limiting for this test by setting it to 0 as we + // use out of window ACKs to measure the advertised window. + var tcpInvalidRateLimit stack.TCPInvalidRateLimitOption + if err := stk.SetOption(tcpInvalidRateLimit); err != nil { + t.Fatalf("e.stack.SetOption(%#v) = %s", tcpInvalidRateLimit, err) + } + const receiveBufferSize = 80 << 10 // 80KB. const maxReceiveBufferSize = receiveBufferSize * 10 { -- cgit v1.2.3