#1398 - send ACK when available buffer space gets larger than 1 MSS

When receiving data, netstack avoids sending spurious acks. When user does recv() should netstack send ack telling the sender that the window was increased? It depends. Before this patch, netstack _will_ send the ack in the case when window was zero or window >> scale was zero. Basically - when recv space increased from zero. This is not working right with silly-window-avoidance on the sender side. Some network stacks refuse to transmit segments, that will fill the window but are below MSS. Before this patch, this confuses netstack. On one hand if the window was like 3 bytes, netstack will _not_ send ack if the window increases. On the other hand sending party will refuse to transmit 3-byte packet. This patch changes that, making netstack will send an ACK when the available buffer size increases to or above 1*MSS. This will inform other party buffer is large enough, and hopefully uncork it. Signed-off-by: Marek Majkowski <marek@cloudflare.com>
author: Marek Majkowski <marek@cloudflare.com> 2020-01-02 12:09:24 +0100
committer: Marek Majkowski <marek@cloudflare.com> 2020-01-07 20:35:39 +0000
commit: 08a97a6d193f46cd547fadae3bb4125cc788543b (patch)
tree: b828fca0e14c1ab2954c5bfe99a8233979c50340 /pkg/tcpip/transport
parent: 1f384ac42b9ee8b52000dc2bff79d975853519ed (diff)
3 files changed, 161 insertions, 14 deletions
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index fe629aa40..5d42f8045 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -955,10 +955,20 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 	}
 
 	e.rcvBufUsed -= len(v)
+
+	avail := e.receiveBufferAvailableLocked()
+
+	// If the window was small before, lower than MSS, send immediate
+	// ack. Without this, the sender might be stuck  waiting for the
+	// window to grow, while we think the window is non-zero so we
+	// don't need to send acks. To avoid silly window syndrome, send
+	// ack only when the window grows above one MSS.
+	crossedMSS := avail-len(v) < int(e.amss) && avail >= int(e.amss)
+
 	// If the window was zero before this read and if the read freed up
 	// enough buffer space for the scaled window to be non-zero then notify
 	// the protocol goroutine to send a window update.
-	if e.zeroWindow && !e.zeroReceiveWindow(e.rcv.rcvWndScale) {
+	if (e.zeroWindow && !e.zeroReceiveWindow(e.rcv.rcvWndScale)) || crossedMSS {
 		e.zeroWindow = false
 		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
 	}
@@ -1138,11 +1148,8 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 //
 // It must be called with rcvListMu held.
 func (e *endpoint) zeroReceiveWindow(scale uint8) bool {
-	if e.rcvBufUsed >= e.rcvBufSize {
-		return true
-	}
-
-	return ((e.rcvBufSize - e.rcvBufUsed) >> scale) == 0
+	avail := e.receiveBufferAvailableLocked()
+	return (avail >> scale) == 0
 }
 
 // SetSockOptInt sets a socket option.
@@ -1181,9 +1188,16 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
 			size = math.MaxInt32 / 2
 		}
 
+		availBefore := e.receiveBufferAvailableLocked()
 		e.rcvBufSize = size
+		availAfter := e.receiveBufferAvailableLocked()
+
 		e.rcvAutoParams.disabled = true
-		if e.zeroWindow && !e.zeroReceiveWindow(scale) {
+
+		// Immediatelly send ACK in two cases: when the buffer
+		// grows so that it leaves zero-window state, when the
+		// buffer grows from small < MSS to >= MSS.
+		if (e.zeroWindow && !e.zeroReceiveWindow(scale)) || (availBefore < int(e.amss) && availAfter >= int(e.amss)) {
 			e.zeroWindow = false
 			mask |= notifyNonZeroReceiveWindow
 		}
@@ -2229,7 +2243,7 @@ func (e *endpoint) readyToRead(s *segment) {
 		// we set the zero window before we deliver the segment to ensure
 		// that a subsequent read of the segment will correctly trigger
 		// a non-zero notification.
-		if avail := e.receiveBufferAvailableLocked(); avail>>e.rcv.rcvWndScale == 0 {
+		if e.zeroReceiveWindow(e.rcv.rcvWndScale) {
 			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
 			e.zeroWindow = true
 		}
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 0a5534959..05c8488f8 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -98,12 +98,6 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
 // in such cases we may need to send an ack to indicate to our peer that it can
 // resume sending data.
 func (r *receiver) nonZeroWindow() {
-	if (r.rcvAcc-r.rcvNxt)>>r.rcvWndScale != 0 {
-		// We never got around to announcing a zero window size, so we
-		// don't need to immediately announce a nonzero one.
-		return
-	}
-
 	// Immediately send an ack.
 	r.ep.snd.sendAck()
 }
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index e8fe4dab5..a05365c6a 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -6561,3 +6561,142 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want)
 	}
 }
+
+func TestIncreaseWindowOnReceive(t *testing.T) {
+	// This test ensures that the endpoint sends an ack,
+	// after recv() when the window grows to more than 1 MSS.
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	const rcvBuf = 65535 * 10
+	c.CreateConnected(789, 30000, rcvBuf)
+
+	// Write chunks of ~30000 bytes. It's important that two
+	// payloads make it equal or longer than MSS.
+	remain := rcvBuf
+	sent := 0
+	data := make([]byte, defaultMTU/2)
+	lastWnd := uint16(0)
+
+	for remain > len(data) {
+		c.SendPacket(data, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  seqnum.Value(790 + sent),
+			AckNum:  c.IRS.Add(1),
+			RcvWnd:  30000,
+		})
+		sent += len(data)
+		remain -= len(data)
+
+		lastWnd = uint16(remain)
+		if remain > 0xffff {
+			lastWnd = 0xffff
+		}
+		checker.IPv4(t, c.GetPacket(),
+			checker.PayloadLen(header.TCPMinimumSize),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)+1),
+				checker.AckNum(uint32(790+sent)),
+				checker.Window(lastWnd),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+	}
+
+	if lastWnd == 0xffff || lastWnd == 0 {
+		t.Fatalf("expected small, non-zero window: %d", lastWnd)
+	}
+
+	// We now have < 1 MSS in the buffer space.  Read the data! An
+	// ack should be sent in response to that. The window was not
+	// zero, but it grew to larger than MSS.
+	_, _, err := c.EP.Read(nil)
+	if err != nil {
+		t.Fatalf("Read failed: %v", err)
+	}
+
+	_, _, err = c.EP.Read(nil)
+	if err != nil {
+		t.Fatalf("Read failed: %v", err)
+	}
+
+	// After reading two packets, we surely crossed MSS. See the ack:
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+sent)),
+			checker.Window(uint16(0xffff)),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
+
+func TestIncreaseWindowOnBufferResize(t *testing.T) {
+	// This test ensures that the endpoint sends an ack,
+	// after available recv buffer grows to more than 1 MSS.
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	const rcvBuf = 65535 * 10
+	c.CreateConnected(789, 30000, rcvBuf)
+
+	// Write chunks of ~30000 bytes. It's important that two
+	// payloads make it equal or longer than MSS.
+	remain := rcvBuf
+	sent := 0
+	data := make([]byte, defaultMTU/2)
+	lastWnd := uint16(0)
+
+	for remain > len(data) {
+		c.SendPacket(data, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  seqnum.Value(790 + sent),
+			AckNum:  c.IRS.Add(1),
+			RcvWnd:  30000,
+		})
+		sent += len(data)
+		remain -= len(data)
+
+		lastWnd = uint16(remain)
+		if remain > 0xffff {
+			lastWnd = 0xffff
+		}
+		checker.IPv4(t, c.GetPacket(),
+			checker.PayloadLen(header.TCPMinimumSize),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)+1),
+				checker.AckNum(uint32(790+sent)),
+				checker.Window(lastWnd),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+	}
+
+	if lastWnd == 0xffff || lastWnd == 0 {
+		t.Fatalf("expected small, non-zero window: %d", lastWnd)
+	}
+
+	// Increasing the buffer from should generate an ACK,
+	// since window grew from small value to larger equal MSS
+	c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBuf*2)
+
+	// After reading two packets, we surely crossed MSS. See the ack:
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+sent)),
+			checker.Window(uint16(0xffff)),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
author	Marek Majkowski <marek@cloudflare.com>	2020-01-02 12:09:24 +0100
committer	Marek Majkowski <marek@cloudflare.com>	2020-01-07 20:35:39 +0000
commit	08a97a6d193f46cd547fadae3bb4125cc788543b (patch)
tree	b828fca0e14c1ab2954c5bfe99a8233979c50340 /pkg/tcpip/transport
parent	1f384ac42b9ee8b52000dc2bff79d975853519ed (diff)