5 files changed, 230 insertions, 39 deletions
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index e56183e71..119c4c505 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -76,12 +76,16 @@ type TCPCubicState struct {
 // TCPRACKState is used to hold a copy of the internal RACK state when the
 // TCPProbeFunc is invoked.
 type TCPRACKState struct {
-	XmitTime    time.Time
-	EndSequence seqnum.Value
-	FACK        seqnum.Value
-	RTT         time.Duration
-	Reord       bool
-	DSACKSeen   bool
+	XmitTime      time.Time
+	EndSequence   seqnum.Value
+	FACK          seqnum.Value
+	RTT           time.Duration
+	Reord         bool
+	DSACKSeen     bool
+	ReoWnd        time.Duration
+	ReoWndIncr    uint8
+	ReoWndPersist int8
+	RTTSeq        seqnum.Value
 }
 
 // TCPEndpointID is the unique 4 tuple that identifies a given endpoint.
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 9a53f0d56..6e4e26c39 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -3031,12 +3031,16 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
 
 	rc := &e.snd.rc
 	s.Sender.RACKState = stack.TCPRACKState{
-		XmitTime:    rc.xmitTime,
-		EndSequence: rc.endSequence,
-		FACK:        rc.fack,
-		RTT:         rc.rtt,
-		Reord:       rc.reorderSeen,
-		DSACKSeen:   rc.dsackSeen,
+		XmitTime:      rc.xmitTime,
+		EndSequence:   rc.endSequence,
+		FACK:          rc.fack,
+		RTT:           rc.rtt,
+		Reord:         rc.reorderSeen,
+		DSACKSeen:     rc.dsackSeen,
+		ReoWnd:        rc.reoWnd,
+		ReoWndIncr:    rc.reoWndIncr,
+		ReoWndPersist: rc.reoWndPersist,
+		RTTSeq:        rc.rttSeq,
 	}
 	return s
 }
diff --git a/pkg/tcpip/transport/tcp/rack.go b/pkg/tcpip/transport/tcp/rack.go
index c0578f2d6..d85cb405a 100644
--- a/pkg/tcpip/transport/tcp/rack.go
+++ b/pkg/tcpip/transport/tcp/rack.go
@@ -22,12 +22,21 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 )
 
-// wcDelayedACKTimeout is the recommended maximum delayed ACK timer value as
-// defined in https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.
-// It stands for worst case delayed ACK timer (WCDelAckT). When FlightSize is
-// 1, PTO is inflated by WCDelAckT time to compensate for a potential long
-// delayed ACK timer at the receiver.
-const wcDelayedACKTimeout = 200 * time.Millisecond
+const (
+	// wcDelayedACKTimeout is the recommended maximum delayed ACK timer
+	// value as defined in the RFC. It stands for worst case delayed ACK
+	// timer (WCDelAckT). When FlightSize is 1, PTO is inflated by
+	// WCDelAckT time to compensate for a potential long delayed ACK timer
+	// at the receiver.
+	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.
+	wcDelayedACKTimeout = 200 * time.Millisecond
+
+	// tcpRACKRecoveryThreshold is the number of loss recoveries for which
+	// the reorder window is inflated and after that the reorder window is
+	// reset to its initial value of minRTT/4.
+	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2.
+	tcpRACKRecoveryThreshold = 16
+)
 
 // RACK is a loss detection algorithm used in TCP to detect packet loss and
 // reordering using transmission timestamp of the packets instead of packet or
@@ -44,6 +53,11 @@ type rackControl struct {
 	// endSequence is the ending TCP sequence number of rackControl.seg.
 	endSequence seqnum.Value
 
+	// exitedRecovery indicates if the connection is exiting loss recovery.
+	// This flag is set if the sender is leaving the recovery after
+	// receiving an ACK and is reset during updating of reorder window.
+	exitedRecovery bool
+
 	// fack is the highest selectively or cumulatively acknowledged
 	// sequence.
 	fack seqnum.Value
@@ -51,15 +65,30 @@ type rackControl struct {
 	// minRTT is the estimated minimum RTT of the connection.
 	minRTT time.Duration
 
+	// reorderSeen indicates if reordering has been detected on this
+	// connection.
+	reorderSeen bool
+
+	// reoWnd is the reordering window time used for recording packet
+	// transmission times. It is used to defer the moment at which RACK
+	// marks a packet lost.
+	reoWnd time.Duration
+
+	// reoWndIncr is the multiplier applied to adjust reorder window.
+	reoWndIncr uint8
+
+	// reoWndPersist is the number of loss recoveries before resetting
+	// reorder window.
+	reoWndPersist int8
+
 	// rtt is the RTT of the most recently delivered packet on the
 	// connection (either cumulatively acknowledged or selectively
 	// acknowledged) that was not marked invalid as a possible spurious
 	// retransmission.
 	rtt time.Duration
 
-	// reorderSeen indicates if reordering has been detected on this
-	// connection.
-	reorderSeen bool
+	// rttSeq is the SND.NXT when rtt is updated.
+	rttSeq seqnum.Value
 
 	// xmitTime is the latest transmission timestamp of rackControl.seg.
 	xmitTime time.Time `state:".(unixTime)"`
@@ -75,29 +104,36 @@ type rackControl struct {
 	// tlpHighRxt the value of sender.sndNxt at the time of sending
 	// a TLP retransmission.
 	tlpHighRxt seqnum.Value
+
+	// snd is a reference to the sender.
+	snd *sender
 }
 
 // init initializes RACK specific fields.
-func (rc *rackControl) init() {
+func (rc *rackControl) init(snd *sender, iss seqnum.Value) {
+	rc.fack = iss
+	rc.reoWndIncr = 1
+	rc.snd = snd
 	rc.probeTimer.init(&rc.probeWaker)
 }
 
 // update will update the RACK related fields when an ACK has been received.
-// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
-func (rc *rackControl) update(seg *segment, ackSeg *segment, offset uint32) {
+// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2
+func (rc *rackControl) update(seg *segment, ackSeg *segment) {
 	rtt := time.Now().Sub(seg.xmitTime)
+	tsOffset := rc.snd.ep.tsOffset
 
 	// If the ACK is for a retransmitted packet, do not update if it is a
 	// spurious inference which is determined by below checks:
-	// 1. When Timestamping option is available, if the TSVal is less than the
-	// transmit time of the most recent retransmitted packet.
+	// 1. When Timestamping option is available, if the TSVal is less than
+	// the transmit time of the most recent retransmitted packet.
 	// 2. When RTT calculated for the packet is less than the smoothed RTT
 	// for the connection.
 	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
 	// step 2
 	if seg.xmitCount > 1 {
 		if ackSeg.parsedOptions.TS && ackSeg.parsedOptions.TSEcr != 0 {
-			if ackSeg.parsedOptions.TSEcr < tcpTimeStamp(seg.xmitTime, offset) {
+			if ackSeg.parsedOptions.TSEcr < tcpTimeStamp(seg.xmitTime, tsOffset) {
 				return
 			}
 		}
@@ -149,9 +185,8 @@ func (rc *rackControl) detectReorder(seg *segment) {
 	}
 }
 
-// setDSACKSeen updates rack control if duplicate SACK is seen by the connection.
-func (rc *rackControl) setDSACKSeen() {
-	rc.dsackSeen = true
+func (rc *rackControl) setDSACKSeen(dsackSeen bool) {
+	rc.dsackSeen = dsackSeen
 }
 
 // shouldSchedulePTO dictates whether we should schedule a PTO or not.
@@ -272,3 +307,82 @@ func (s *sender) detectTLPRecovery(ack seqnum.Value, rcvdSeg *segment) {
 		}
 	}
 }
+
+// updateRACKReorderWindow updates the reorder window.
+// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
+// * Step 4: Update RACK reordering window
+//   To handle the prevalent small degree of reordering, RACK.reo_wnd serves as
+//   an allowance for settling time before marking a packet lost. RACK starts
+//   initially with a conservative window of min_RTT/4. If no reordering has
+//   been observed RACK uses reo_wnd of zero during loss recovery, in order to
+//   retransmit quickly, or when the number of DUPACKs exceeds the classic
+//   DUPACKthreshold.
+func (rc *rackControl) updateRACKReorderWindow(ackSeg *segment) {
+	dsackSeen := rc.dsackSeen
+	snd := rc.snd
+
+	// React to DSACK once per round trip.
+	// If SND.UNA < RACK.rtt_seq:
+	//   RACK.dsack = false
+	if snd.sndUna.LessThan(rc.rttSeq) {
+		dsackSeen = false
+	}
+
+	// If RACK.dsack:
+	//   RACK.reo_wnd_incr += 1
+	//   RACK.dsack = false
+	//   RACK.rtt_seq = SND.NXT
+	//   RACK.reo_wnd_persist = 16
+	if dsackSeen {
+		rc.reoWndIncr++
+		dsackSeen = false
+		rc.rttSeq = snd.sndNxt
+		rc.reoWndPersist = tcpRACKRecoveryThreshold
+	} else if rc.exitedRecovery {
+		// Else if exiting loss recovery:
+		//   RACK.reo_wnd_persist -= 1
+		//   If RACK.reo_wnd_persist <= 0:
+		//      RACK.reo_wnd_incr = 1
+		rc.reoWndPersist--
+		if rc.reoWndPersist <= 0 {
+			rc.reoWndIncr = 1
+		}
+		rc.exitedRecovery = false
+	}
+
+	// Reorder window is zero during loss recovery, or when the number of
+	// DUPACKs exceeds the classic DUPACKthreshold.
+	// If RACK.reord is FALSE:
+	//   If in loss recovery:  (If in fast or timeout recovery)
+	//      RACK.reo_wnd = 0
+	//      Return
+	//   Else if RACK.pkts_sacked >= RACK.dupthresh:
+	//     RACK.reo_wnd = 0
+	//     return
+	if !rc.reorderSeen {
+		if snd.state == tcpip.RTORecovery || snd.state == tcpip.SACKRecovery {
+			rc.reoWnd = 0
+			return
+		}
+
+		if snd.sackedOut >= nDupAckThreshold {
+			rc.reoWnd = 0
+			return
+		}
+	}
+
+	// Calculate reorder window.
+	// RACK.reo_wnd = RACK.min_RTT / 4 * RACK.reo_wnd_incr
+	// RACK.reo_wnd = min(RACK.reo_wnd, SRTT)
+	snd.rtt.Lock()
+	srtt := snd.rtt.srtt
+	snd.rtt.Unlock()
+	rc.reoWnd = time.Duration((int64(rc.minRTT) / 4) * int64(rc.reoWndIncr))
+	if srtt < rc.reoWnd {
+		rc.reoWnd = srtt
+	}
+}
+
+func (rc *rackControl) exitRecovery() {
+	rc.exitedRecovery = true
+}
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 2967c9f97..dfc8fd248 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -258,14 +258,9 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
 			highRxt:   iss,
 			rescueRxt: iss,
 		},
-		rc: rackControl{
-			fack: iss,
-		},
 		gso: ep.gso != nil,
 	}
 
-	s.rc.init()
-
 	if s.gso {
 		s.ep.gso.MSS = uint16(maxPayloadSize)
 	}
@@ -273,6 +268,7 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
 	s.cc = s.initCongestionControl(ep.cc)
 
 	s.lr = s.initLossRecovery()
+	s.rc.init(s, iss)
 
 	// A negative sndWndScale means that no scaling is in use, otherwise we
 	// store the scaling value.
@@ -1058,7 +1054,6 @@ func (s *sender) leaveRecovery() {
 
 	// Deflate cwnd. It had been artificially inflated when new dups arrived.
 	s.sndCwnd = s.sndSsthresh
-
 	s.cc.PostRecovery()
 }
 
@@ -1195,11 +1190,13 @@ func (s *sender) isDupAck(seg *segment) bool {
 // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
 // steps 2 and 3.
 func (s *sender) walkSACK(rcvdSeg *segment) {
+	s.rc.setDSACKSeen(false)
+
 	// Look for DSACK block.
 	idx := 0
 	n := len(rcvdSeg.parsedOptions.SACKBlocks)
 	if checkDSACK(rcvdSeg) {
-		s.rc.setDSACKSeen()
+		s.rc.setDSACKSeen(true)
 		idx = 1
 		n--
 	}
@@ -1220,7 +1217,7 @@ func (s *sender) walkSACK(rcvdSeg *segment) {
 	for _, sb := range sackBlocks {
 		for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 {
 			if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked {
-				s.rc.update(seg, rcvdSeg, s.ep.tsOffset)
+				s.rc.update(seg, rcvdSeg)
 				s.rc.detectReorder(seg)
 				seg.acked = true
 				s.sackedOut += s.pCount(seg, s.maxPayloadSize)
@@ -1424,7 +1421,7 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 
 			// Update the RACK fields if SACK is enabled.
 			if s.ep.sackPermitted && !seg.acked {
-				s.rc.update(seg, rcvdSeg, s.ep.tsOffset)
+				s.rc.update(seg, rcvdSeg)
 				s.rc.detectReorder(seg)
 			}
 
@@ -1454,6 +1451,10 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 			s.cc.Update(originalOutstanding - s.outstanding)
 			if s.fr.last.LessThan(s.sndUna) {
 				s.state = tcpip.Open
+				// Update RACK when we are exiting fast or RTO
+				// recovery as described in the RFC
+				// draft-ietf-tcpm-rack-08 Section-7.2 Step 4.
+				s.rc.exitRecovery()
 			}
 		}
 
@@ -1477,6 +1478,12 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 		}
 	}
 
+	// Update RACK reorder window.
+	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
+	// * Upon receiving an ACK:
+	// * Step 4: Update RACK reordering window
+	s.rc.updateRACKReorderWindow(rcvdSeg)
+
 	// Now that we've popped all acknowledged data from the retransmit
 	// queue, retransmit if needed.
 	if s.fr.active {
diff --git a/pkg/tcpip/transport/tcp/tcp_rack_test.go b/pkg/tcpip/transport/tcp/tcp_rack_test.go
index af915203b..a6a26b705 100644
--- a/pkg/tcpip/transport/tcp/tcp_rack_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_rack_test.go
@@ -16,6 +16,7 @@ package tcp_test
 
 import (
 	"bytes"
+	"fmt"
 	"testing"
 	"time"
 
@@ -534,3 +535,64 @@ func TestRACKWithInvalidDSACKBlock(t *testing.T) {
 	// ACK before the test completes.
 	<-probeDone
 }
+
+func addReorderWindowCheckerProbe(c *context.Context, numACK int, probeDone chan error) {
+	var n int
+	c.Stack().AddTCPProbe(func(state stack.TCPEndpointState) {
+		// Validate that RACK detects DSACK.
+		n++
+		if n < numACK {
+			return
+		}
+
+		if state.Sender.RACKState.ReoWnd == 0 || state.Sender.RACKState.ReoWnd > state.Sender.SRTT {
+			probeDone <- fmt.Errorf("got RACKState.ReoWnd: %v, expected it to be greater than 0 and less than %v", state.Sender.RACKState.ReoWnd, state.Sender.SRTT)
+			return
+		}
+
+		if state.Sender.RACKState.ReoWndIncr != 1 {
+			probeDone <- fmt.Errorf("got RACKState.ReoWndIncr: %v, want: 1", state.Sender.RACKState.ReoWndIncr)
+			return
+		}
+
+		if state.Sender.RACKState.ReoWndPersist > 0 {
+			probeDone <- fmt.Errorf("got RACKState.ReoWndPersist: %v, want: greater than 0", state.Sender.RACKState.ReoWndPersist)
+			return
+		}
+		probeDone <- nil
+	})
+}
+
+func TestRACKCheckReorderWindow(t *testing.T) {
+	c := context.New(t, uint32(mtu))
+	defer c.Cleanup()
+
+	probeDone := make(chan error)
+	const ackNumToVerify = 3
+	addReorderWindowCheckerProbe(c, ackNumToVerify, probeDone)
+
+	const numPackets = 7
+	sendAndReceive(t, c, numPackets)
+
+	// Send ACK for #1 packet.
+	bytesRead := maxPayload
+	seq := seqnum.Value(context.TestInitialSequenceNumber).Add(1)
+	c.SendAck(seq, bytesRead)
+
+	// Missing [2-6] packets and SACK #7 packet.
+	seq = seqnum.Value(context.TestInitialSequenceNumber).Add(1)
+	start := c.IRS.Add(1 + seqnum.Size(6*maxPayload))
+	end := start.Add(seqnum.Size(maxPayload))
+	c.SendAckWithSACK(seq, bytesRead, []header.SACKBlock{{start, end}})
+
+	// Received delayed packets [2-6] which indicates there is reordering
+	// in the connection.
+	bytesRead += 6 * maxPayload
+	c.SendAck(seq, bytesRead)
+
+	// Wait for the probe function to finish processing the ACK before the
+	// test completes.
+	if err := <-probeDone; err != nil {
+		t.Fatalf("unexpected values for RACK variables: %v", err)
+	}
+}