summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--pkg/tcpip/stack/stack.go16
-rw-r--r--pkg/tcpip/transport/tcp/endpoint.go16
-rw-r--r--pkg/tcpip/transport/tcp/rack.go150
-rw-r--r--pkg/tcpip/transport/tcp/snd.go25
-rw-r--r--pkg/tcpip/transport/tcp/tcp_rack_test.go62
5 files changed, 230 insertions, 39 deletions
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index e56183e71..119c4c505 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -76,12 +76,16 @@ type TCPCubicState struct {
// TCPRACKState is used to hold a copy of the internal RACK state when the
// TCPProbeFunc is invoked.
type TCPRACKState struct {
- XmitTime time.Time
- EndSequence seqnum.Value
- FACK seqnum.Value
- RTT time.Duration
- Reord bool
- DSACKSeen bool
+ XmitTime time.Time
+ EndSequence seqnum.Value
+ FACK seqnum.Value
+ RTT time.Duration
+ Reord bool
+ DSACKSeen bool
+ ReoWnd time.Duration
+ ReoWndIncr uint8
+ ReoWndPersist int8
+ RTTSeq seqnum.Value
}
// TCPEndpointID is the unique 4 tuple that identifies a given endpoint.
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 9a53f0d56..6e4e26c39 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -3031,12 +3031,16 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
rc := &e.snd.rc
s.Sender.RACKState = stack.TCPRACKState{
- XmitTime: rc.xmitTime,
- EndSequence: rc.endSequence,
- FACK: rc.fack,
- RTT: rc.rtt,
- Reord: rc.reorderSeen,
- DSACKSeen: rc.dsackSeen,
+ XmitTime: rc.xmitTime,
+ EndSequence: rc.endSequence,
+ FACK: rc.fack,
+ RTT: rc.rtt,
+ Reord: rc.reorderSeen,
+ DSACKSeen: rc.dsackSeen,
+ ReoWnd: rc.reoWnd,
+ ReoWndIncr: rc.reoWndIncr,
+ ReoWndPersist: rc.reoWndPersist,
+ RTTSeq: rc.rttSeq,
}
return s
}
diff --git a/pkg/tcpip/transport/tcp/rack.go b/pkg/tcpip/transport/tcp/rack.go
index c0578f2d6..d85cb405a 100644
--- a/pkg/tcpip/transport/tcp/rack.go
+++ b/pkg/tcpip/transport/tcp/rack.go
@@ -22,12 +22,21 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/seqnum"
)
-// wcDelayedACKTimeout is the recommended maximum delayed ACK timer value as
-// defined in https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.
-// It stands for worst case delayed ACK timer (WCDelAckT). When FlightSize is
-// 1, PTO is inflated by WCDelAckT time to compensate for a potential long
-// delayed ACK timer at the receiver.
-const wcDelayedACKTimeout = 200 * time.Millisecond
+const (
+ // wcDelayedACKTimeout is the recommended maximum delayed ACK timer
+ // value as defined in the RFC. It stands for worst case delayed ACK
+ // timer (WCDelAckT). When FlightSize is 1, PTO is inflated by
+ // WCDelAckT time to compensate for a potential long delayed ACK timer
+ // at the receiver.
+ // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.
+ wcDelayedACKTimeout = 200 * time.Millisecond
+
+ // tcpRACKRecoveryThreshold is the number of loss recoveries for which
+ // the reorder window is inflated and after that the reorder window is
+ // reset to its initial value of minRTT/4.
+ // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2.
+ tcpRACKRecoveryThreshold = 16
+)
// RACK is a loss detection algorithm used in TCP to detect packet loss and
// reordering using transmission timestamp of the packets instead of packet or
@@ -44,6 +53,11 @@ type rackControl struct {
// endSequence is the ending TCP sequence number of rackControl.seg.
endSequence seqnum.Value
+ // exitedRecovery indicates if the connection is exiting loss recovery.
+ // This flag is set if the sender is leaving the recovery after
+ // receiving an ACK and is reset during updating of reorder window.
+ exitedRecovery bool
+
// fack is the highest selectively or cumulatively acknowledged
// sequence.
fack seqnum.Value
@@ -51,15 +65,30 @@ type rackControl struct {
// minRTT is the estimated minimum RTT of the connection.
minRTT time.Duration
+ // reorderSeen indicates if reordering has been detected on this
+ // connection.
+ reorderSeen bool
+
+ // reoWnd is the reordering window time used for recording packet
+ // transmission times. It is used to defer the moment at which RACK
+ // marks a packet lost.
+ reoWnd time.Duration
+
+ // reoWndIncr is the multiplier applied to adjust reorder window.
+ reoWndIncr uint8
+
+ // reoWndPersist is the number of loss recoveries before resetting
+ // reorder window.
+ reoWndPersist int8
+
// rtt is the RTT of the most recently delivered packet on the
// connection (either cumulatively acknowledged or selectively
// acknowledged) that was not marked invalid as a possible spurious
// retransmission.
rtt time.Duration
- // reorderSeen indicates if reordering has been detected on this
- // connection.
- reorderSeen bool
+ // rttSeq is the SND.NXT when rtt is updated.
+ rttSeq seqnum.Value
// xmitTime is the latest transmission timestamp of rackControl.seg.
xmitTime time.Time `state:".(unixTime)"`
@@ -75,29 +104,36 @@ type rackControl struct {
// tlpHighRxt the value of sender.sndNxt at the time of sending
// a TLP retransmission.
tlpHighRxt seqnum.Value
+
+ // snd is a reference to the sender.
+ snd *sender
}
// init initializes RACK specific fields.
-func (rc *rackControl) init() {
+func (rc *rackControl) init(snd *sender, iss seqnum.Value) {
+ rc.fack = iss
+ rc.reoWndIncr = 1
+ rc.snd = snd
rc.probeTimer.init(&rc.probeWaker)
}
// update will update the RACK related fields when an ACK has been received.
-// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
-func (rc *rackControl) update(seg *segment, ackSeg *segment, offset uint32) {
+// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2
+func (rc *rackControl) update(seg *segment, ackSeg *segment) {
rtt := time.Now().Sub(seg.xmitTime)
+ tsOffset := rc.snd.ep.tsOffset
// If the ACK is for a retransmitted packet, do not update if it is a
// spurious inference which is determined by below checks:
- // 1. When Timestamping option is available, if the TSVal is less than the
- // transmit time of the most recent retransmitted packet.
+ // 1. When Timestamping option is available, if the TSVal is less than
+ // the transmit time of the most recent retransmitted packet.
// 2. When RTT calculated for the packet is less than the smoothed RTT
// for the connection.
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
// step 2
if seg.xmitCount > 1 {
if ackSeg.parsedOptions.TS && ackSeg.parsedOptions.TSEcr != 0 {
- if ackSeg.parsedOptions.TSEcr < tcpTimeStamp(seg.xmitTime, offset) {
+ if ackSeg.parsedOptions.TSEcr < tcpTimeStamp(seg.xmitTime, tsOffset) {
return
}
}
@@ -149,9 +185,8 @@ func (rc *rackControl) detectReorder(seg *segment) {
}
}
-// setDSACKSeen updates rack control if duplicate SACK is seen by the connection.
-func (rc *rackControl) setDSACKSeen() {
- rc.dsackSeen = true
+func (rc *rackControl) setDSACKSeen(dsackSeen bool) {
+ rc.dsackSeen = dsackSeen
}
// shouldSchedulePTO dictates whether we should schedule a PTO or not.
@@ -272,3 +307,82 @@ func (s *sender) detectTLPRecovery(ack seqnum.Value, rcvdSeg *segment) {
}
}
}
+
+// updateRACKReorderWindow updates the reorder window.
+// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
+// * Step 4: Update RACK reordering window
+// To handle the prevalent small degree of reordering, RACK.reo_wnd serves as
+// an allowance for settling time before marking a packet lost. RACK starts
+// initially with a conservative window of min_RTT/4. If no reordering has
+// been observed RACK uses reo_wnd of zero during loss recovery, in order to
+// retransmit quickly, or when the number of DUPACKs exceeds the classic
+// DUPACKthreshold.
+func (rc *rackControl) updateRACKReorderWindow(ackSeg *segment) {
+ dsackSeen := rc.dsackSeen
+ snd := rc.snd
+
+ // React to DSACK once per round trip.
+ // If SND.UNA < RACK.rtt_seq:
+ // RACK.dsack = false
+ if snd.sndUna.LessThan(rc.rttSeq) {
+ dsackSeen = false
+ }
+
+ // If RACK.dsack:
+ // RACK.reo_wnd_incr += 1
+ // RACK.dsack = false
+ // RACK.rtt_seq = SND.NXT
+ // RACK.reo_wnd_persist = 16
+ if dsackSeen {
+ rc.reoWndIncr++
+ dsackSeen = false
+ rc.rttSeq = snd.sndNxt
+ rc.reoWndPersist = tcpRACKRecoveryThreshold
+ } else if rc.exitedRecovery {
+ // Else if exiting loss recovery:
+ // RACK.reo_wnd_persist -= 1
+ // If RACK.reo_wnd_persist <= 0:
+ // RACK.reo_wnd_incr = 1
+ rc.reoWndPersist--
+ if rc.reoWndPersist <= 0 {
+ rc.reoWndIncr = 1
+ }
+ rc.exitedRecovery = false
+ }
+
+ // Reorder window is zero during loss recovery, or when the number of
+ // DUPACKs exceeds the classic DUPACKthreshold.
+ // If RACK.reord is FALSE:
+ // If in loss recovery: (If in fast or timeout recovery)
+ // RACK.reo_wnd = 0
+ // Return
+ // Else if RACK.pkts_sacked >= RACK.dupthresh:
+ // RACK.reo_wnd = 0
+ // return
+ if !rc.reorderSeen {
+ if snd.state == tcpip.RTORecovery || snd.state == tcpip.SACKRecovery {
+ rc.reoWnd = 0
+ return
+ }
+
+ if snd.sackedOut >= nDupAckThreshold {
+ rc.reoWnd = 0
+ return
+ }
+ }
+
+ // Calculate reorder window.
+ // RACK.reo_wnd = RACK.min_RTT / 4 * RACK.reo_wnd_incr
+ // RACK.reo_wnd = min(RACK.reo_wnd, SRTT)
+ snd.rtt.Lock()
+ srtt := snd.rtt.srtt
+ snd.rtt.Unlock()
+ rc.reoWnd = time.Duration((int64(rc.minRTT) / 4) * int64(rc.reoWndIncr))
+ if srtt < rc.reoWnd {
+ rc.reoWnd = srtt
+ }
+}
+
+func (rc *rackControl) exitRecovery() {
+ rc.exitedRecovery = true
+}
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 2967c9f97..dfc8fd248 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -258,14 +258,9 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
highRxt: iss,
rescueRxt: iss,
},
- rc: rackControl{
- fack: iss,
- },
gso: ep.gso != nil,
}
- s.rc.init()
-
if s.gso {
s.ep.gso.MSS = uint16(maxPayloadSize)
}
@@ -273,6 +268,7 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
s.cc = s.initCongestionControl(ep.cc)
s.lr = s.initLossRecovery()
+ s.rc.init(s, iss)
// A negative sndWndScale means that no scaling is in use, otherwise we
// store the scaling value.
@@ -1058,7 +1054,6 @@ func (s *sender) leaveRecovery() {
// Deflate cwnd. It had been artificially inflated when new dups arrived.
s.sndCwnd = s.sndSsthresh
-
s.cc.PostRecovery()
}
@@ -1195,11 +1190,13 @@ func (s *sender) isDupAck(seg *segment) bool {
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
// steps 2 and 3.
func (s *sender) walkSACK(rcvdSeg *segment) {
+ s.rc.setDSACKSeen(false)
+
// Look for DSACK block.
idx := 0
n := len(rcvdSeg.parsedOptions.SACKBlocks)
if checkDSACK(rcvdSeg) {
- s.rc.setDSACKSeen()
+ s.rc.setDSACKSeen(true)
idx = 1
n--
}
@@ -1220,7 +1217,7 @@ func (s *sender) walkSACK(rcvdSeg *segment) {
for _, sb := range sackBlocks {
for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 {
if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked {
- s.rc.update(seg, rcvdSeg, s.ep.tsOffset)
+ s.rc.update(seg, rcvdSeg)
s.rc.detectReorder(seg)
seg.acked = true
s.sackedOut += s.pCount(seg, s.maxPayloadSize)
@@ -1424,7 +1421,7 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
// Update the RACK fields if SACK is enabled.
if s.ep.sackPermitted && !seg.acked {
- s.rc.update(seg, rcvdSeg, s.ep.tsOffset)
+ s.rc.update(seg, rcvdSeg)
s.rc.detectReorder(seg)
}
@@ -1454,6 +1451,10 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
s.cc.Update(originalOutstanding - s.outstanding)
if s.fr.last.LessThan(s.sndUna) {
s.state = tcpip.Open
+ // Update RACK when we are exiting fast or RTO
+ // recovery as described in the RFC
+ // draft-ietf-tcpm-rack-08 Section-7.2 Step 4.
+ s.rc.exitRecovery()
}
}
@@ -1477,6 +1478,12 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
}
}
+ // Update RACK reorder window.
+ // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
+ // * Upon receiving an ACK:
+ // * Step 4: Update RACK reordering window
+ s.rc.updateRACKReorderWindow(rcvdSeg)
+
// Now that we've popped all acknowledged data from the retransmit
// queue, retransmit if needed.
if s.fr.active {
diff --git a/pkg/tcpip/transport/tcp/tcp_rack_test.go b/pkg/tcpip/transport/tcp/tcp_rack_test.go
index af915203b..a6a26b705 100644
--- a/pkg/tcpip/transport/tcp/tcp_rack_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_rack_test.go
@@ -16,6 +16,7 @@ package tcp_test
import (
"bytes"
+ "fmt"
"testing"
"time"
@@ -534,3 +535,64 @@ func TestRACKWithInvalidDSACKBlock(t *testing.T) {
// ACK before the test completes.
<-probeDone
}
+
+func addReorderWindowCheckerProbe(c *context.Context, numACK int, probeDone chan error) {
+ var n int
+ c.Stack().AddTCPProbe(func(state stack.TCPEndpointState) {
+ // Validate that RACK detects DSACK.
+ n++
+ if n < numACK {
+ return
+ }
+
+ if state.Sender.RACKState.ReoWnd == 0 || state.Sender.RACKState.ReoWnd > state.Sender.SRTT {
+ probeDone <- fmt.Errorf("got RACKState.ReoWnd: %v, expected it to be greater than 0 and less than %v", state.Sender.RACKState.ReoWnd, state.Sender.SRTT)
+ return
+ }
+
+ if state.Sender.RACKState.ReoWndIncr != 1 {
+ probeDone <- fmt.Errorf("got RACKState.ReoWndIncr: %v, want: 1", state.Sender.RACKState.ReoWndIncr)
+ return
+ }
+
+ if state.Sender.RACKState.ReoWndPersist > 0 {
+ probeDone <- fmt.Errorf("got RACKState.ReoWndPersist: %v, want: greater than 0", state.Sender.RACKState.ReoWndPersist)
+ return
+ }
+ probeDone <- nil
+ })
+}
+
+func TestRACKCheckReorderWindow(t *testing.T) {
+ c := context.New(t, uint32(mtu))
+ defer c.Cleanup()
+
+ probeDone := make(chan error)
+ const ackNumToVerify = 3
+ addReorderWindowCheckerProbe(c, ackNumToVerify, probeDone)
+
+ const numPackets = 7
+ sendAndReceive(t, c, numPackets)
+
+ // Send ACK for #1 packet.
+ bytesRead := maxPayload
+ seq := seqnum.Value(context.TestInitialSequenceNumber).Add(1)
+ c.SendAck(seq, bytesRead)
+
+ // Missing [2-6] packets and SACK #7 packet.
+ seq = seqnum.Value(context.TestInitialSequenceNumber).Add(1)
+ start := c.IRS.Add(1 + seqnum.Size(6*maxPayload))
+ end := start.Add(seqnum.Size(maxPayload))
+ c.SendAckWithSACK(seq, bytesRead, []header.SACKBlock{{start, end}})
+
+ // Received delayed packets [2-6] which indicates there is reordering
+ // in the connection.
+ bytesRead += 6 * maxPayload
+ c.SendAck(seq, bytesRead)
+
+ // Wait for the probe function to finish processing the ACK before the
+ // test completes.
+ if err := <-probeDone; err != nil {
+ t.Fatalf("unexpected values for RACK variables: %v", err)
+ }
+}