4 files changed, 94 insertions, 27 deletions
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 189c01c8f..0aaef495d 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -1219,12 +1219,6 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
 			return true, nil
 		}
 
-		// Increase counter if after processing the segment we would potentially
-		// advertise a zero window.
-		if crossed, above := e.windowCrossedACKThresholdLocked(-s.segMemSize()); crossed && !above {
-			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
-		}
-
 		// Now check if the received segment has caused us to transition
 		// to a CLOSED state, if yes then terminate processing and do
 		// not invoke the sender.
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index bc3b409ba..3bcd3923a 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -248,6 +248,11 @@ type ReceiveErrors struct {
 	// ZeroRcvWindowState is the number of times we advertised
 	// a zero receive window when rcvList is full.
 	ZeroRcvWindowState tcpip.StatCounter
+
+	// WantZeroWindow is the number of times we wanted to advertise a
+	// zero receive window but couldn't because it would have caused
+	// the receive window's right edge to shrink.
+	WantZeroRcvWindow tcpip.StatCounter
 }
 
 // SendErrors collect segment send errors within the transport layer.
@@ -1162,7 +1167,7 @@ func (e *endpoint) cleanupLocked() {
 // wndFromSpace returns the window that we can advertise based on the available
 // receive buffer space.
 func wndFromSpace(space int) int {
-	return space / (1 << rcvAdvWndScale)
+	return space >> rcvAdvWndScale
 }
 
 // initialReceiveWindow returns the initial receive window to advertise in the
@@ -1518,6 +1523,38 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	return num, tcpip.ControlMessages{}, nil
 }
 
+// selectWindowLocked returns the new window without checking for shrinking or scaling
+// applied.
+// Precondition: e.mu and e.rcvListMu must be held.
+func (e *endpoint) selectWindowLocked() (wnd seqnum.Size) {
+	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked())
+	maxWindow := wndFromSpace(e.rcvBufSize)
+	wndFromUsedBytes := maxWindow - e.rcvBufUsed
+
+	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
+	// cases where we receive a lot of small segments the segment overhead is a
+	// lot higher and we can run out socket buffer space before we can fill the
+	// previous window we advertised. In cases where we receive MSS sized or close
+	// MSS sized segments we will probably run out of window space before we
+	// exhaust receive buffer.
+	newWnd := wndFromAvailable
+	if newWnd > wndFromUsedBytes {
+		newWnd = wndFromUsedBytes
+	}
+	if newWnd < 0 {
+		newWnd = 0
+	}
+	return seqnum.Size(newWnd)
+}
+
+// selectWindow invokes selectWindowLocked after acquiring e.rcvListMu.
+func (e *endpoint) selectWindow() (wnd seqnum.Size) {
+	e.rcvListMu.Lock()
+	wnd = e.selectWindowLocked()
+	e.rcvListMu.Unlock()
+	return wnd
+}
+
 // windowCrossedACKThresholdLocked checks if the receive window to be announced
 // would be under aMSS or under the window derived from half receive buffer,
 // whichever smaller. This is useful as a receive side silly window syndrome
@@ -1534,7 +1571,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 //
 // Precondition: e.mu and e.rcvListMu must be held.
 func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed bool, above bool) {
-	newAvail := wndFromSpace(e.receiveBufferAvailableLocked())
+	newAvail := int(e.selectWindowLocked())
 	oldAvail := newAvail - deltaBefore
 	if oldAvail < 0 {
 		oldAvail = 0
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 48bf196d8..8e0b7c843 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -43,6 +43,9 @@ type receiver struct {
 	// rcvWnd is the non-scaled receive window last advertised to the peer.
 	rcvWnd seqnum.Size
 
+	// rcvWUP is the rcvNxt value at the last window update sent.
+	rcvWUP seqnum.Value
+
 	rcvWndScale uint8
 
 	closed bool
@@ -64,6 +67,7 @@ func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale
 		rcvNxt:          irs + 1,
 		rcvAcc:          irs.Add(rcvWnd + 1),
 		rcvWnd:          rcvWnd,
+		rcvWUP:          irs + 1,
 		rcvWndScale:     rcvWndScale,
 		lastRcvdAckTime: time.Now(),
 	}
@@ -84,34 +88,54 @@ func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
 	return header.Acceptable(segSeq, segLen, r.rcvNxt, r.rcvNxt.Add(advertisedWindowSize))
 }
 
+// currentWindow returns the available space in the window that was advertised
+// last to our peer.
+func (r *receiver) currentWindow() (curWnd seqnum.Size) {
+	endOfWnd := r.rcvWUP.Add(r.rcvWnd)
+	if endOfWnd.LessThan(r.rcvNxt) {
+		// return 0 if r.rcvNxt is past the end of the previously advertised window.
+		// This can happen because we accept a large segment completely even if
+		// accepting it causes it to partially exceed the advertised window.
+		return 0
+	}
+	return r.rcvNxt.Size(endOfWnd)
+}
+
 // getSendParams returns the parameters needed by the sender when building
 // segments to send.
 func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
-	avail := wndFromSpace(r.ep.receiveBufferAvailable())
-	if avail == 0 {
-		// We have no space available to accept any data, move to zero window
-		// state.
-		r.rcvWnd = 0
-		return r.rcvNxt, 0
-	}
-
-	acc := r.rcvNxt.Add(seqnum.Size(avail))
-	newWnd := r.rcvNxt.Size(acc)
-	curWnd := r.rcvNxt.Size(r.rcvAcc)
-
+	newWnd := r.ep.selectWindow()
+	curWnd := r.currentWindow()
 	// Update rcvAcc only if new window is > previously advertised window. We
 	// should never shrink the acceptable sequence space once it has been
 	// advertised the peer. If we shrink the acceptable sequence space then we
 	// would end up dropping bytes that might already be in flight.
-	if newWnd > curWnd {
-		r.rcvAcc = r.rcvNxt.Add(newWnd)
+	// ====================================================  sequence space.
+	// ^             ^               ^                   ^
+	// rcvWUP       rcvNxt         rcvAcc          new rcvAcc
+	//               <=====curWnd ===>
+	//               <========= newWnd > curWnd ========= >
+	if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) {
+		// If the new window moves the right edge, then update rcvAcc.
+		r.rcvAcc = r.rcvNxt.Add(seqnum.Size(newWnd))
 	} else {
+		if newWnd == 0 {
+			// newWnd is zero but we can't advertise a zero as it would cause window
+			// to shrink so just increment a metric to record this event.
+			r.ep.stats.ReceiveErrors.WantZeroRcvWindow.Increment()
+		}
 		newWnd = curWnd
 	}
 	// Stash away the non-scaled receive window as we use it for measuring
 	// receiver's estimated RTT.
 	r.rcvWnd = newWnd
-	return r.rcvNxt, r.rcvWnd >> r.rcvWndScale
+	r.rcvWUP = r.rcvNxt
+	scaledWnd := r.rcvWnd >> r.rcvWndScale
+	if scaledWnd == 0 {
+		// Increment a metric if we are advertising an actual zero window.
+		r.ep.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
+	}
+	return r.rcvNxt, scaledWnd
 }
 
 // nonZeroWindow is called when the receive window grows from zero to nonzero;
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 5b504d0d1..a7149efd0 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -6264,14 +6264,27 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 		rawEP.NextSeqNum--
 		rawEP.SendPacketWithTS(nil, tsVal)
 		rawEP.NextSeqNum++
+
 		if i == 0 {
 			// In the first iteration the receiver based RTT is not
 			// yet known as a result the moderation code should not
 			// increase the advertised window.
 			rawEP.VerifyACKRcvWnd(scaleRcvWnd(curRcvWnd))
 		} else {
-			pkt := c.GetPacket()
-			curRcvWnd = int(header.TCP(header.IPv4(pkt).Payload()).WindowSize()) << c.WindowScale
+			// Read loop above could generate an ACK if the window had dropped to
+			// zero and then read had opened it up.
+			lastACK := c.GetPacket()
+			// Discard any intermediate ACKs and only check the last ACK we get in a
+			// short time period of few ms.
+			for {
+				time.Sleep(1 * time.Millisecond)
+				pkt := c.GetPacketNonBlocking()
+				if pkt == nil {
+					break
+				}
+				lastACK = pkt
+			}
+			curRcvWnd = int(header.TCP(header.IPv4(lastACK).Payload()).WindowSize()) << c.WindowScale
 			// If thew new current window is close maxReceiveBufferSize then terminate
 			// the loop. This can happen before all iterations are done due to timing
 			// differences when running the test.
@@ -7328,7 +7341,7 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) {
 
 	// Write chunks of ~30000 bytes. It's important that two
 	// payloads make it equal or longer than MSS.
-	remain := rcvBuf * 2
+	remain := rcvBuf
 	sent := 0
 	data := make([]byte, defaultMTU/2)
 
@@ -7343,7 +7356,6 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) {
 		})
 		sent += len(data)
 		remain -= len(data)
-
 		checker.IPv4(t, c.GetPacket(),
 			checker.PayloadLen(header.TCPMinimumSize),
 			checker.TCP(