summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--pkg/tcpip/stack/stack.go13
-rw-r--r--pkg/tcpip/stack/stack_options.go25
-rw-r--r--pkg/tcpip/transport/tcp/connect.go7
-rw-r--r--pkg/tcpip/transport/tcp/endpoint.go20
-rw-r--r--pkg/tcpip/transport/tcp/endpoint_state.go10
-rw-r--r--pkg/tcpip/transport/tcp/rcv.go4
-rw-r--r--pkg/tcpip/transport/tcp/snd.go10
-rw-r--r--pkg/tcpip/transport/tcp/tcp_test.go7
-rw-r--r--test/packetimpact/tests/tcp_outside_the_window_test.go18
-rw-r--r--test/syscalls/linux/tcp_socket.cc36
10 files changed, 145 insertions, 5 deletions
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 57ad412a1..a51d758d0 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -458,6 +458,18 @@ type Stack struct {
// receiveBufferSize holds the min/default/max receive buffer sizes for
// endpoints other than TCP.
receiveBufferSize ReceiveBufferSizeOption
+
+ // tcpInvalidRateLimit is the maximal rate for sending duplicate
+ // acknowledgements in response to incoming TCP packets that are for an existing
+ // connection but that are invalid due to any of the following reasons:
+ //
+ // a) out-of-window sequence number.
+ // b) out-of-window acknowledgement number.
+ // c) PAWS check failure (when implemented).
+ //
+ // This is required to prevent potential ACK loops.
+ // Setting this to 0 will disable all rate limiting.
+ tcpInvalidRateLimit time.Duration
}
// UniqueID is an abstract generator of unique identifiers.
@@ -668,6 +680,7 @@ func New(opts Options) *Stack {
Default: DefaultBufferSize,
Max: DefaultMaxBufferSize,
},
+ tcpInvalidRateLimit: defaultTCPInvalidRateLimit,
}
// Add specified network protocols.
diff --git a/pkg/tcpip/stack/stack_options.go b/pkg/tcpip/stack/stack_options.go
index 8d9b20b7e..3066f4ffd 100644
--- a/pkg/tcpip/stack/stack_options.go
+++ b/pkg/tcpip/stack/stack_options.go
@@ -15,6 +15,8 @@
package stack
import (
+ "time"
+
"gvisor.dev/gvisor/pkg/tcpip"
)
@@ -29,6 +31,10 @@ const (
// DefaultMaxBufferSize is the default maximum permitted size of a
// send/receive buffer.
DefaultMaxBufferSize = 4 << 20 // 4 MiB
+
+ // defaultTCPInvalidRateLimit is the default value for
+ // stack.TCPInvalidRateLimit.
+ defaultTCPInvalidRateLimit = 500 * time.Millisecond
)
// ReceiveBufferSizeOption is used by stack.(Stack*).Option/SetOption to
@@ -39,6 +45,10 @@ type ReceiveBufferSizeOption struct {
Max int
}
+// TCPInvalidRateLimitOption is used by stack.(Stack*).Option/SetOption to get/set
+// stack.tcpInvalidRateLimit.
+type TCPInvalidRateLimitOption time.Duration
+
// SetOption allows setting stack wide options.
func (s *Stack) SetOption(option interface{}) tcpip.Error {
switch v := option.(type) {
@@ -74,6 +84,15 @@ func (s *Stack) SetOption(option interface{}) tcpip.Error {
s.mu.Unlock()
return nil
+ case TCPInvalidRateLimitOption:
+ if v < 0 {
+ return &tcpip.ErrInvalidOptionValue{}
+ }
+ s.mu.Lock()
+ s.tcpInvalidRateLimit = time.Duration(v)
+ s.mu.Unlock()
+ return nil
+
default:
return &tcpip.ErrUnknownProtocolOption{}
}
@@ -94,6 +113,12 @@ func (s *Stack) Option(option interface{}) tcpip.Error {
s.mu.RUnlock()
return nil
+ case *TCPInvalidRateLimitOption:
+ s.mu.RLock()
+ *v = TCPInvalidRateLimitOption(s.tcpInvalidRateLimit)
+ s.mu.RUnlock()
+ return nil
+
default:
return &tcpip.ErrUnknownProtocolOption{}
}
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 4695b66d6..34a631b53 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -333,7 +333,9 @@ func (h *handshake) synRcvdState(s *segment) tcpip.Error {
// number and "After sending the acknowledgment, drop the unacceptable
// segment and return."
if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
- h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd)
+ if h.ep.allowOutOfWindowAck() {
+ h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd)
+ }
return nil
}
@@ -1185,8 +1187,7 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err tcpip.Error) {
// endpoint MUST terminate its connection. The local TCP endpoint
// should then rely on SYN retransmission from the remote end to
// re-establish the connection.
-
- e.snd.sendAck()
+ e.snd.maybeSendOutOfWindowAck(s)
} else if s.flagIsSet(header.TCPFlagAck) {
// Patch the window size in the segment according to the
// send window scale.
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index e645aa194..4e5a6089f 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -688,6 +688,10 @@ type endpoint struct {
// ops is used to get socket level options.
ops tcpip.SocketOptions
+
+ // lastOutOfWindowAckTime is the time at which the an ACK was sent in response
+ // to an out of window segment being received by this endpoint.
+ lastOutOfWindowAckTime time.Time `state:".(unixTime)"`
}
// UniqueID implements stack.TransportEndpoint.UniqueID.
@@ -3125,3 +3129,19 @@ func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption {
Max: ss.Max,
}
}
+
+// allowOutOfWindowAck returns true if an out-of-window ACK can be sent now.
+func (e *endpoint) allowOutOfWindowAck() bool {
+ var limit stack.TCPInvalidRateLimitOption
+ if err := e.stack.Option(&limit); err != nil {
+ panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err))
+ }
+
+ now := time.Now()
+ if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) {
+ return false
+ }
+
+ e.lastOutOfWindowAckTime = now
+ return true
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index c21dbc682..e4368026f 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -308,6 +308,16 @@ func (e *endpoint) loadRecentTSTime(unix unixTime) {
e.recentTSTime = time.Unix(unix.second, unix.nano)
}
+// saveLastOutOfWindowAckTime is invoked by stateify.
+func (e *endpoint) saveLastOutOfWindowAckTime() unixTime {
+ return unixTime{e.lastOutOfWindowAckTime.Unix(), e.lastOutOfWindowAckTime.UnixNano()}
+}
+
+// loadLastOutOfWindowAckTime is invoked by stateify.
+func (e *endpoint) loadLastOutOfWindowAckTime(unix unixTime) {
+ e.lastOutOfWindowAckTime = time.Unix(unix.second, unix.nano)
+}
+
// saveMeasureTime is invoked by stateify.
func (r *rcvBufAutoTuneParams) saveMeasureTime() unixTime {
return unixTime{r.measureTime.Unix(), r.measureTime.UnixNano()}
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 7a7c402c4..a5c82b8fa 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -385,7 +385,7 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
// fails, we ignore the packet:
// https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L5591
if r.ep.snd.sndNxt.LessThan(s.ackNumber) {
- r.ep.snd.sendAck()
+ r.ep.snd.maybeSendOutOfWindowAck(s)
return true, nil
}
@@ -454,7 +454,7 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err tcpip.Error) {
// send an ACK and stop further processing of the segment.
// This is according to RFC 793, page 68.
if !r.acceptable(segSeq, segLen) {
- r.ep.snd.sendAck()
+ r.ep.snd.maybeSendOutOfWindowAck(s)
return true, nil
}
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index dfc8fd248..538edd6cf 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -1548,3 +1548,13 @@ func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags byte, seq
return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd)
}
+
+// maybeSendOutOfWindowAck sends an ACK if we are not being rate limited
+// currently.
+func (s *sender) maybeSendOutOfWindowAck(seg *segment) {
+ // Data packets are unlikely to be part of an ACK loop. So always send
+ // an ACK for a packet w/ data.
+ if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() {
+ s.sendAck()
+ }
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index da2730e27..cd3c4a027 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -6302,6 +6302,13 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
// Enable Auto-tuning.
stk := c.Stack()
+ // Disable out of window rate limiting for this test by setting it to 0 as we
+ // use out of window ACKs to measure the advertised window.
+ var tcpInvalidRateLimit stack.TCPInvalidRateLimitOption
+ if err := stk.SetOption(tcpInvalidRateLimit); err != nil {
+ t.Fatalf("e.stack.SetOption(%#v) = %s", tcpInvalidRateLimit, err)
+ }
+
const receiveBufferSize = 80 << 10 // 80KB.
const maxReceiveBufferSize = receiveBufferSize * 10
{
diff --git a/test/packetimpact/tests/tcp_outside_the_window_test.go b/test/packetimpact/tests/tcp_outside_the_window_test.go
index 1b041932a..8909a348e 100644
--- a/test/packetimpact/tests/tcp_outside_the_window_test.go
+++ b/test/packetimpact/tests/tcp_outside_the_window_test.go
@@ -84,6 +84,24 @@ func TestTCPOutsideTheWindow(t *testing.T) {
if tt.expectACK && err != nil {
t.Fatalf("expected an ACK packet within %s but got none: %s", timeout, err)
}
+ // Data packets w/o SYN bits are always acked by Linux. Netstack ACK's data packets
+ // always right now. So only send a second segment and test for no ACK for packets
+ // with no data.
+ if tt.expectACK && tt.payload == nil {
+ // Sending another out-of-window segment immediately should not trigger
+ // an ACK if less than 500ms(default rate limit for out-of-window ACKs)
+ // has passed since the last ACK was sent.
+ t.Logf("sending another segment")
+ conn.Send(t, testbench.TCP{
+ Flags: testbench.Uint8(tt.tcpFlags),
+ SeqNum: testbench.Uint32(uint32(conn.LocalSeqNum(t).Add(windowSize))),
+ }, tt.payload...)
+ timeout := 3 * time.Second
+ gotACK, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: localSeqNum}, timeout)
+ if err == nil {
+ t.Fatalf("expected no ACK packet but got one: %s", gotACK)
+ }
+ }
if !tt.expectACK && gotACK != nil {
t.Fatalf("expected no ACK packet within %s but got one: %s", timeout, gotACK)
}
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 9028ab024..f56c50e61 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -1168,6 +1168,42 @@ TEST_P(SimpleTcpSocketTest, SelfConnectSendRecv_NoRandomSave) {
EXPECT_EQ(read_bytes, kBufSz);
}
+TEST_P(SimpleTcpSocketTest, SelfConnectSend_NoRandomSave) {
+ // Initialize address to the loopback one.
+ sockaddr_storage addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t addrlen = sizeof(addr);
+
+ const FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ constexpr int max_seg = 256;
+ ASSERT_THAT(
+ setsockopt(s.get(), SOL_TCP, TCP_MAXSEG, &max_seg, sizeof(max_seg)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(bind(s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+ // Get the bound port.
+ ASSERT_THAT(
+ getsockname(s.get(), reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ ASSERT_THAT(RetryEINTR(connect)(
+ s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+
+ std::vector<char> writebuf(512 << 10); // 512 KiB.
+
+ // Try to send the whole thing.
+ int n;
+ ASSERT_THAT(n = SendFd(s.get(), writebuf.data(), writebuf.size(), 0),
+ SyscallSucceeds());
+
+ // We should have written the whole thing.
+ EXPECT_EQ(n, writebuf.size());
+ EXPECT_THAT(shutdown(s.get(), SHUT_WR), SyscallSucceedsWithValue(0));
+}
+
TEST_P(SimpleTcpSocketTest, NonBlockingConnect) {
const FileDescriptor listener =
ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));