diff options
-rw-r--r-- | pkg/tcpip/stack/stack.go | 16 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/endpoint.go | 16 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/rack.go | 150 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/snd.go | 25 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/tcp_rack_test.go | 62 |
5 files changed, 230 insertions, 39 deletions
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index e56183e71..119c4c505 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -76,12 +76,16 @@ type TCPCubicState struct { // TCPRACKState is used to hold a copy of the internal RACK state when the // TCPProbeFunc is invoked. type TCPRACKState struct { - XmitTime time.Time - EndSequence seqnum.Value - FACK seqnum.Value - RTT time.Duration - Reord bool - DSACKSeen bool + XmitTime time.Time + EndSequence seqnum.Value + FACK seqnum.Value + RTT time.Duration + Reord bool + DSACKSeen bool + ReoWnd time.Duration + ReoWndIncr uint8 + ReoWndPersist int8 + RTTSeq seqnum.Value } // TCPEndpointID is the unique 4 tuple that identifies a given endpoint. diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 9a53f0d56..6e4e26c39 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -3031,12 +3031,16 @@ func (e *endpoint) completeState() stack.TCPEndpointState { rc := &e.snd.rc s.Sender.RACKState = stack.TCPRACKState{ - XmitTime: rc.xmitTime, - EndSequence: rc.endSequence, - FACK: rc.fack, - RTT: rc.rtt, - Reord: rc.reorderSeen, - DSACKSeen: rc.dsackSeen, + XmitTime: rc.xmitTime, + EndSequence: rc.endSequence, + FACK: rc.fack, + RTT: rc.rtt, + Reord: rc.reorderSeen, + DSACKSeen: rc.dsackSeen, + ReoWnd: rc.reoWnd, + ReoWndIncr: rc.reoWndIncr, + ReoWndPersist: rc.reoWndPersist, + RTTSeq: rc.rttSeq, } return s } diff --git a/pkg/tcpip/transport/tcp/rack.go b/pkg/tcpip/transport/tcp/rack.go index c0578f2d6..d85cb405a 100644 --- a/pkg/tcpip/transport/tcp/rack.go +++ b/pkg/tcpip/transport/tcp/rack.go @@ -22,12 +22,21 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/seqnum" ) -// wcDelayedACKTimeout is the recommended maximum delayed ACK timer value as -// defined in https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5. -// It stands for worst case delayed ACK timer (WCDelAckT). When FlightSize is -// 1, PTO is inflated by WCDelAckT time to compensate for a potential long -// delayed ACK timer at the receiver. -const wcDelayedACKTimeout = 200 * time.Millisecond +const ( + // wcDelayedACKTimeout is the recommended maximum delayed ACK timer + // value as defined in the RFC. It stands for worst case delayed ACK + // timer (WCDelAckT). When FlightSize is 1, PTO is inflated by + // WCDelAckT time to compensate for a potential long delayed ACK timer + // at the receiver. + // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5. + wcDelayedACKTimeout = 200 * time.Millisecond + + // tcpRACKRecoveryThreshold is the number of loss recoveries for which + // the reorder window is inflated and after that the reorder window is + // reset to its initial value of minRTT/4. + // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2. + tcpRACKRecoveryThreshold = 16 +) // RACK is a loss detection algorithm used in TCP to detect packet loss and // reordering using transmission timestamp of the packets instead of packet or @@ -44,6 +53,11 @@ type rackControl struct { // endSequence is the ending TCP sequence number of rackControl.seg. endSequence seqnum.Value + // exitedRecovery indicates if the connection is exiting loss recovery. + // This flag is set if the sender is leaving the recovery after + // receiving an ACK and is reset during updating of reorder window. + exitedRecovery bool + // fack is the highest selectively or cumulatively acknowledged // sequence. fack seqnum.Value @@ -51,15 +65,30 @@ type rackControl struct { // minRTT is the estimated minimum RTT of the connection. minRTT time.Duration + // reorderSeen indicates if reordering has been detected on this + // connection. + reorderSeen bool + + // reoWnd is the reordering window time used for recording packet + // transmission times. It is used to defer the moment at which RACK + // marks a packet lost. + reoWnd time.Duration + + // reoWndIncr is the multiplier applied to adjust reorder window. + reoWndIncr uint8 + + // reoWndPersist is the number of loss recoveries before resetting + // reorder window. + reoWndPersist int8 + // rtt is the RTT of the most recently delivered packet on the // connection (either cumulatively acknowledged or selectively // acknowledged) that was not marked invalid as a possible spurious // retransmission. rtt time.Duration - // reorderSeen indicates if reordering has been detected on this - // connection. - reorderSeen bool + // rttSeq is the SND.NXT when rtt is updated. + rttSeq seqnum.Value // xmitTime is the latest transmission timestamp of rackControl.seg. xmitTime time.Time `state:".(unixTime)"` @@ -75,29 +104,36 @@ type rackControl struct { // tlpHighRxt the value of sender.sndNxt at the time of sending // a TLP retransmission. tlpHighRxt seqnum.Value + + // snd is a reference to the sender. + snd *sender } // init initializes RACK specific fields. -func (rc *rackControl) init() { +func (rc *rackControl) init(snd *sender, iss seqnum.Value) { + rc.fack = iss + rc.reoWndIncr = 1 + rc.snd = snd rc.probeTimer.init(&rc.probeWaker) } // update will update the RACK related fields when an ACK has been received. -// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 -func (rc *rackControl) update(seg *segment, ackSeg *segment, offset uint32) { +// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2 +func (rc *rackControl) update(seg *segment, ackSeg *segment) { rtt := time.Now().Sub(seg.xmitTime) + tsOffset := rc.snd.ep.tsOffset // If the ACK is for a retransmitted packet, do not update if it is a // spurious inference which is determined by below checks: - // 1. When Timestamping option is available, if the TSVal is less than the - // transmit time of the most recent retransmitted packet. + // 1. When Timestamping option is available, if the TSVal is less than + // the transmit time of the most recent retransmitted packet. // 2. When RTT calculated for the packet is less than the smoothed RTT // for the connection. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 // step 2 if seg.xmitCount > 1 { if ackSeg.parsedOptions.TS && ackSeg.parsedOptions.TSEcr != 0 { - if ackSeg.parsedOptions.TSEcr < tcpTimeStamp(seg.xmitTime, offset) { + if ackSeg.parsedOptions.TSEcr < tcpTimeStamp(seg.xmitTime, tsOffset) { return } } @@ -149,9 +185,8 @@ func (rc *rackControl) detectReorder(seg *segment) { } } -// setDSACKSeen updates rack control if duplicate SACK is seen by the connection. -func (rc *rackControl) setDSACKSeen() { - rc.dsackSeen = true +func (rc *rackControl) setDSACKSeen(dsackSeen bool) { + rc.dsackSeen = dsackSeen } // shouldSchedulePTO dictates whether we should schedule a PTO or not. @@ -272,3 +307,82 @@ func (s *sender) detectTLPRecovery(ack seqnum.Value, rcvdSeg *segment) { } } } + +// updateRACKReorderWindow updates the reorder window. +// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 +// * Step 4: Update RACK reordering window +// To handle the prevalent small degree of reordering, RACK.reo_wnd serves as +// an allowance for settling time before marking a packet lost. RACK starts +// initially with a conservative window of min_RTT/4. If no reordering has +// been observed RACK uses reo_wnd of zero during loss recovery, in order to +// retransmit quickly, or when the number of DUPACKs exceeds the classic +// DUPACKthreshold. +func (rc *rackControl) updateRACKReorderWindow(ackSeg *segment) { + dsackSeen := rc.dsackSeen + snd := rc.snd + + // React to DSACK once per round trip. + // If SND.UNA < RACK.rtt_seq: + // RACK.dsack = false + if snd.sndUna.LessThan(rc.rttSeq) { + dsackSeen = false + } + + // If RACK.dsack: + // RACK.reo_wnd_incr += 1 + // RACK.dsack = false + // RACK.rtt_seq = SND.NXT + // RACK.reo_wnd_persist = 16 + if dsackSeen { + rc.reoWndIncr++ + dsackSeen = false + rc.rttSeq = snd.sndNxt + rc.reoWndPersist = tcpRACKRecoveryThreshold + } else if rc.exitedRecovery { + // Else if exiting loss recovery: + // RACK.reo_wnd_persist -= 1 + // If RACK.reo_wnd_persist <= 0: + // RACK.reo_wnd_incr = 1 + rc.reoWndPersist-- + if rc.reoWndPersist <= 0 { + rc.reoWndIncr = 1 + } + rc.exitedRecovery = false + } + + // Reorder window is zero during loss recovery, or when the number of + // DUPACKs exceeds the classic DUPACKthreshold. + // If RACK.reord is FALSE: + // If in loss recovery: (If in fast or timeout recovery) + // RACK.reo_wnd = 0 + // Return + // Else if RACK.pkts_sacked >= RACK.dupthresh: + // RACK.reo_wnd = 0 + // return + if !rc.reorderSeen { + if snd.state == tcpip.RTORecovery || snd.state == tcpip.SACKRecovery { + rc.reoWnd = 0 + return + } + + if snd.sackedOut >= nDupAckThreshold { + rc.reoWnd = 0 + return + } + } + + // Calculate reorder window. + // RACK.reo_wnd = RACK.min_RTT / 4 * RACK.reo_wnd_incr + // RACK.reo_wnd = min(RACK.reo_wnd, SRTT) + snd.rtt.Lock() + srtt := snd.rtt.srtt + snd.rtt.Unlock() + rc.reoWnd = time.Duration((int64(rc.minRTT) / 4) * int64(rc.reoWndIncr)) + if srtt < rc.reoWnd { + rc.reoWnd = srtt + } +} + +func (rc *rackControl) exitRecovery() { + rc.exitedRecovery = true +} diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 2967c9f97..dfc8fd248 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -258,14 +258,9 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint highRxt: iss, rescueRxt: iss, }, - rc: rackControl{ - fack: iss, - }, gso: ep.gso != nil, } - s.rc.init() - if s.gso { s.ep.gso.MSS = uint16(maxPayloadSize) } @@ -273,6 +268,7 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint s.cc = s.initCongestionControl(ep.cc) s.lr = s.initLossRecovery() + s.rc.init(s, iss) // A negative sndWndScale means that no scaling is in use, otherwise we // store the scaling value. @@ -1058,7 +1054,6 @@ func (s *sender) leaveRecovery() { // Deflate cwnd. It had been artificially inflated when new dups arrived. s.sndCwnd = s.sndSsthresh - s.cc.PostRecovery() } @@ -1195,11 +1190,13 @@ func (s *sender) isDupAck(seg *segment) bool { // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 // steps 2 and 3. func (s *sender) walkSACK(rcvdSeg *segment) { + s.rc.setDSACKSeen(false) + // Look for DSACK block. idx := 0 n := len(rcvdSeg.parsedOptions.SACKBlocks) if checkDSACK(rcvdSeg) { - s.rc.setDSACKSeen() + s.rc.setDSACKSeen(true) idx = 1 n-- } @@ -1220,7 +1217,7 @@ func (s *sender) walkSACK(rcvdSeg *segment) { for _, sb := range sackBlocks { for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 { if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked { - s.rc.update(seg, rcvdSeg, s.ep.tsOffset) + s.rc.update(seg, rcvdSeg) s.rc.detectReorder(seg) seg.acked = true s.sackedOut += s.pCount(seg, s.maxPayloadSize) @@ -1424,7 +1421,7 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { // Update the RACK fields if SACK is enabled. if s.ep.sackPermitted && !seg.acked { - s.rc.update(seg, rcvdSeg, s.ep.tsOffset) + s.rc.update(seg, rcvdSeg) s.rc.detectReorder(seg) } @@ -1454,6 +1451,10 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { s.cc.Update(originalOutstanding - s.outstanding) if s.fr.last.LessThan(s.sndUna) { s.state = tcpip.Open + // Update RACK when we are exiting fast or RTO + // recovery as described in the RFC + // draft-ietf-tcpm-rack-08 Section-7.2 Step 4. + s.rc.exitRecovery() } } @@ -1477,6 +1478,12 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { } } + // Update RACK reorder window. + // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 + // * Upon receiving an ACK: + // * Step 4: Update RACK reordering window + s.rc.updateRACKReorderWindow(rcvdSeg) + // Now that we've popped all acknowledged data from the retransmit // queue, retransmit if needed. if s.fr.active { diff --git a/pkg/tcpip/transport/tcp/tcp_rack_test.go b/pkg/tcpip/transport/tcp/tcp_rack_test.go index af915203b..a6a26b705 100644 --- a/pkg/tcpip/transport/tcp/tcp_rack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_rack_test.go @@ -16,6 +16,7 @@ package tcp_test import ( "bytes" + "fmt" "testing" "time" @@ -534,3 +535,64 @@ func TestRACKWithInvalidDSACKBlock(t *testing.T) { // ACK before the test completes. <-probeDone } + +func addReorderWindowCheckerProbe(c *context.Context, numACK int, probeDone chan error) { + var n int + c.Stack().AddTCPProbe(func(state stack.TCPEndpointState) { + // Validate that RACK detects DSACK. + n++ + if n < numACK { + return + } + + if state.Sender.RACKState.ReoWnd == 0 || state.Sender.RACKState.ReoWnd > state.Sender.SRTT { + probeDone <- fmt.Errorf("got RACKState.ReoWnd: %v, expected it to be greater than 0 and less than %v", state.Sender.RACKState.ReoWnd, state.Sender.SRTT) + return + } + + if state.Sender.RACKState.ReoWndIncr != 1 { + probeDone <- fmt.Errorf("got RACKState.ReoWndIncr: %v, want: 1", state.Sender.RACKState.ReoWndIncr) + return + } + + if state.Sender.RACKState.ReoWndPersist > 0 { + probeDone <- fmt.Errorf("got RACKState.ReoWndPersist: %v, want: greater than 0", state.Sender.RACKState.ReoWndPersist) + return + } + probeDone <- nil + }) +} + +func TestRACKCheckReorderWindow(t *testing.T) { + c := context.New(t, uint32(mtu)) + defer c.Cleanup() + + probeDone := make(chan error) + const ackNumToVerify = 3 + addReorderWindowCheckerProbe(c, ackNumToVerify, probeDone) + + const numPackets = 7 + sendAndReceive(t, c, numPackets) + + // Send ACK for #1 packet. + bytesRead := maxPayload + seq := seqnum.Value(context.TestInitialSequenceNumber).Add(1) + c.SendAck(seq, bytesRead) + + // Missing [2-6] packets and SACK #7 packet. + seq = seqnum.Value(context.TestInitialSequenceNumber).Add(1) + start := c.IRS.Add(1 + seqnum.Size(6*maxPayload)) + end := start.Add(seqnum.Size(maxPayload)) + c.SendAckWithSACK(seq, bytesRead, []header.SACKBlock{{start, end}}) + + // Received delayed packets [2-6] which indicates there is reordering + // in the connection. + bytesRead += 6 * maxPayload + c.SendAck(seq, bytesRead) + + // Wait for the probe function to finish processing the ACK before the + // test completes. + if err := <-probeDone; err != nil { + t.Fatalf("unexpected values for RACK variables: %v", err) + } +} |