Fix zero receive window advertisements.

With the recent changes db36d948fa63ce950d94a5e8e9ebc37956543661, we try to balance the receive window advertisements between payload lengths vs segment overhead length. This works fine when segment size are much higher than the overhead, but not otherwise. In cases where the segment length is smaller than the segment overhead, we may end up not advertising zero receive window for long time and end up tail-dropping segments. This is especially pronounced when application socket reads are slow or stopped. In this change we do not grow the right edge of the receive window for smaller segment sizes similar to Linux. Also, we keep track of the socket buffer usage and let the window grow if the application is actively reading data. Fixes #4903 PiperOrigin-RevId: 345832012
author: Mithun Iyer <iyerm@google.com> 2020-12-05 01:45:47 -0800
committer: gVisor bot <gvisor-bot@google.com> 2020-12-05 01:48:00 -0800
commit: 3075ede86edf04117f120b3cf110e699034b759c (patch)
tree: 7c62f7c40bbffc7d032b8f3b1d7b040933b28c94
parent: df2dbe3e38aac73926a51d8045dbb419c6167cf5 (diff)
8 files changed, 188 insertions, 11 deletions
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 31eded0ce..c944dccc0 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -16,6 +16,7 @@ package tcp
 
 import (
 	"encoding/binary"
+	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/rand"
@@ -133,7 +134,7 @@ func FindWndScale(wnd seqnum.Size) int {
 		return 0
 	}
 
-	max := seqnum.Size(0xffff)
+	max := seqnum.Size(math.MaxUint16)
 	s := 0
 	for wnd > max && s < header.MaxWndScale {
 		s++
@@ -818,8 +819,8 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso
 	data = data.Clone(nil)
 
 	optLen := len(tf.opts)
-	if tf.rcvWnd > 0xffff {
-		tf.rcvWnd = 0xffff
+	if tf.rcvWnd > math.MaxUint16 {
+		tf.rcvWnd = math.MaxUint16
 	}
 
 	mss := int(gso.MSS)
@@ -863,8 +864,8 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso
 // network endpoint and under the provided identity.
 func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error {
 	optLen := len(tf.opts)
-	if tf.rcvWnd > 0xffff {
-		tf.rcvWnd = 0xffff
+	if tf.rcvWnd > math.MaxUint16 {
+		tf.rcvWnd = math.MaxUint16
 	}
 
 	if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() {
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 8e0b7c843..f2b1b68da 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -16,6 +16,7 @@ package tcp
 
 import (
 	"container/heap"
+	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -48,6 +49,10 @@ type receiver struct {
 
 	rcvWndScale uint8
 
+	// prevBufused is the snapshot of endpoint rcvBufUsed taken when we
+	// advertise a receive window.
+	prevBufUsed int
+
 	closed bool
 
 	// pendingRcvdSegments is bounded by the receive buffer size of the
@@ -80,9 +85,9 @@ func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
 	// outgoing packets, we should use what we have advertised for acceptability
 	// test.
 	scaledWindowSize := r.rcvWnd >> r.rcvWndScale
-	if scaledWindowSize > 0xffff {
+	if scaledWindowSize > math.MaxUint16 {
 		// This is what we actually put in the Window field.
-		scaledWindowSize = 0xffff
+		scaledWindowSize = math.MaxUint16
 	}
 	advertisedWindowSize := scaledWindowSize << r.rcvWndScale
 	return header.Acceptable(segSeq, segLen, r.rcvNxt, r.rcvNxt.Add(advertisedWindowSize))
@@ -106,6 +111,34 @@ func (r *receiver) currentWindow() (curWnd seqnum.Size) {
 func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
 	newWnd := r.ep.selectWindow()
 	curWnd := r.currentWindow()
+	unackLen := int(r.ep.snd.maxSentAck.Size(r.rcvNxt))
+	bufUsed := r.ep.receiveBufferUsed()
+
+	// Grow the right edge of the window only for payloads larger than the
+	// the segment overhead OR if the application is actively consuming data.
+	//
+	// Avoiding growing the right edge otherwise, addresses a situation below:
+	// An application has been slow in reading data and we have burst of
+	// incoming segments lengths < segment overhead. Here, our available free
+	// memory would reduce drastically when compared to the advertised receive
+	// window.
+	//
+	// For example: With incoming 512 bytes segments, segment overhead of
+	// 552 bytes (at the time of writing this comment), with receive window
+	// starting from 1MB and with rcvAdvWndScale being 1, buffer would reach 0
+	// when the curWnd is still 19436 bytes, because for every incoming segment
+	// newWnd would reduce by (552+512) >> rcvAdvWndScale (current value 1),
+	// while curWnd would reduce by 512 bytes.
+	// Such a situation causes us to keep tail dropping the incoming segments
+	// and never advertise zero receive window to the peer.
+	//
+	// Linux does a similar check for minimal sk_buff size (128):
+	// https://github.com/torvalds/linux/blob/d5beb3140f91b1c8a3d41b14d729aefa4dcc58bc/net/ipv4/tcp_input.c#L783
+	//
+	// Also, if the application is reading the data, we keep growing the right
+	// edge, as we are still advertising a window that we think can be serviced.
+	toGrow := unackLen >= SegSize || bufUsed <= r.prevBufUsed
+
 	// Update rcvAcc only if new window is > previously advertised window. We
 	// should never shrink the acceptable sequence space once it has been
 	// advertised the peer. If we shrink the acceptable sequence space then we
@@ -115,7 +148,7 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
 	// rcvWUP       rcvNxt         rcvAcc          new rcvAcc
 	//               <=====curWnd ===>
 	//               <========= newWnd > curWnd ========= >
-	if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) {
+	if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) && toGrow {
 		// If the new window moves the right edge, then update rcvAcc.
 		r.rcvAcc = r.rcvNxt.Add(seqnum.Size(newWnd))
 	} else {
@@ -130,11 +163,24 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
 	// receiver's estimated RTT.
 	r.rcvWnd = newWnd
 	r.rcvWUP = r.rcvNxt
+	r.prevBufUsed = bufUsed
 	scaledWnd := r.rcvWnd >> r.rcvWndScale
 	if scaledWnd == 0 {
 		// Increment a metric if we are advertising an actual zero window.
 		r.ep.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
 	}
+
+	// If we started off with a window larger than what can he held in
+	// the 16bit window field, we ceil the value to the max value.
+	// While ceiling, we still do not want to grow the right edge when
+	// not applicable.
+	if scaledWnd > math.MaxUint16 {
+		if toGrow {
+			scaledWnd = seqnum.Size(math.MaxUint16)
+		} else {
+			scaledWnd = seqnum.Size(uint16(scaledWnd))
+		}
+	}
 	return r.rcvNxt, scaledWnd
 }
 
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 2091989cc..5ef73ec74 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -204,7 +204,7 @@ func (s *segment) payloadSize() int {
 // segMemSize is the amount of memory used to hold the segment data and
 // the associated metadata.
 func (s *segment) segMemSize() int {
-	return segSize + s.data.Size()
+	return SegSize + s.data.Size()
 }
 
 // parse populates the sequence & ack numbers, flags, and window fields of the
diff --git a/pkg/tcpip/transport/tcp/segment_unsafe.go b/pkg/tcpip/transport/tcp/segment_unsafe.go
index 0ab7b8f56..392ff0859 100644
--- a/pkg/tcpip/transport/tcp/segment_unsafe.go
+++ b/pkg/tcpip/transport/tcp/segment_unsafe.go
@@ -19,5 +19,6 @@ import (
 )
 
 const (
-	segSize = int(unsafe.Sizeof(segment{}))
+	// SegSize is the minimal size of the segment overhead.
+	SegSize = int(unsafe.Sizeof(segment{}))
 )
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 98620e6db..1759ebea9 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -6098,10 +6098,13 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 	// Introduce a 25ms latency by delaying the first byte.
 	latency := 25 * time.Millisecond
 	time.Sleep(latency)
-	rawEP.SendPacketWithTS([]byte{1}, tsVal)
+	// Send an initial payload with atleast segment overhead size. The receive
+	// window would not grow for smaller segments.
+	rawEP.SendPacketWithTS(make([]byte, tcp.SegSize), tsVal)
 
 	pkt := rawEP.VerifyAndReturnACKWithTS(tsVal)
 	rcvWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize()
+
 	time.Sleep(25 * time.Millisecond)
 
 	// Allocate a large enough payload for the test.
diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl
index 86833eade..c6c95546a 100644
--- a/test/packetimpact/runner/defs.bzl
+++ b/test/packetimpact/runner/defs.bzl
@@ -224,6 +224,9 @@ ALL_TESTS = [
         name = "tcp_user_timeout",
     ),
     PacketimpactTestInfo(
+        name = "tcp_zero_receive_window",
+    ),
+    PacketimpactTestInfo(
         name = "tcp_queue_receive_in_syn_sent",
     ),
     PacketimpactTestInfo(
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index ce21c079a..373ab8d2f 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -366,6 +366,16 @@ packetimpact_testbench(
     ],
 )
 
+packetimpact_testbench(
+    name = "tcp_zero_receive_window",
+    srcs = ["tcp_zero_receive_window_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
 validate_all_tests()
 
 [packetimpact_go_test(
diff --git a/test/packetimpact/tests/tcp_zero_receive_window_test.go b/test/packetimpact/tests/tcp_zero_receive_window_test.go
new file mode 100644
index 000000000..cf0431c57
--- /dev/null
+++ b/test/packetimpact/tests/tcp_zero_receive_window_test.go
@@ -0,0 +1,113 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_zero_receive_window_test
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.Initialize(flag.CommandLine)
+}
+
+// TestZeroReceiveWindow tests if the DUT sends a zero receive window eventually.
+func TestZeroReceiveWindow(t *testing.T) {
+	for _, payloadLen := range []int{64, 512, 1024} {
+		t.Run(fmt.Sprintf("TestZeroReceiveWindow_with_%dbytes_payload", payloadLen), func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+			defer dut.Close(t, listenFd)
+			conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close(t)
+
+			conn.Connect(t)
+			acceptFd, _ := dut.Accept(t, listenFd)
+			defer dut.Close(t, acceptFd)
+
+			dut.SetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+			samplePayload := &testbench.Payload{Bytes: make([]byte, payloadLen)} //testbench.GenerateRandomPayload(t, payloadLen)}
+			// Expect the DUT to eventually advertize zero receive window.
+			// The test would timeout otherwise.
+			for {
+				conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload)
+				gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+				if err != nil {
+					t.Fatalf("expected packet was not received: %s", err)
+				}
+				if *gotTCP.WindowSize == 0 {
+					break
+				}
+			}
+		})
+	}
+}
+
+// TestNonZeroReceiveWindow tests for the DUT to never send a zero receive
+// window when the data is being read from the socket buffer.
+func TestNonZeroReceiveWindow(t *testing.T) {
+	for _, payloadLen := range []int{64, 512, 1024} {
+		t.Run(fmt.Sprintf("TestZeroReceiveWindow_with_%dbytes_payload", payloadLen), func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+			defer dut.Close(t, listenFd)
+			conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close(t)
+
+			conn.Connect(t)
+			acceptFd, _ := dut.Accept(t, listenFd)
+			defer dut.Close(t, acceptFd)
+
+			dut.SetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+			samplePayload := &testbench.Payload{Bytes: testbench.GenerateRandomPayload(t, payloadLen)}
+			var rcvWindow uint16
+			initRcv := false
+			// This loop keeps a running rcvWindow value from the initial ACK for the data
+			// we sent. Once we have received ACKs with non-zero receive windows, we break
+			// the loop.
+			for {
+				conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload)
+				gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+				if err != nil {
+					t.Fatalf("expected packet was not received: %s", err)
+				}
+				if ret, _, err := dut.RecvWithErrno(context.Background(), t, acceptFd, int32(payloadLen), 0); ret == -1 {
+					t.Fatalf("dut.RecvWithErrno(ctx, t, %d, %d, 0) = %d,_, %s", acceptFd, payloadLen, ret, err)
+				}
+
+				if *gotTCP.WindowSize == 0 {
+					t.Fatalf("expected non-zero receive window.")
+				}
+				if !initRcv {
+					rcvWindow = uint16(*gotTCP.WindowSize)
+					initRcv = true
+				}
+				if rcvWindow <= uint16(payloadLen) {
+					break
+				}
+				rcvWindow -= uint16(payloadLen)
+			}
+		})
+	}
+}
author	Mithun Iyer <iyerm@google.com>	2020-12-05 01:45:47 -0800
committer	gVisor bot <gvisor-bot@google.com>	2020-12-05 01:48:00 -0800
commit	3075ede86edf04117f120b3cf110e699034b759c (patch)
tree	7c62f7c40bbffc7d032b8f3b1d7b040933b28c94
parent	df2dbe3e38aac73926a51d8045dbb419c6167cf5 (diff)