// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "time" "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/seqnum" ) // wcDelayedACKTimeout is the recommended maximum delayed ACK timer value as // defined in https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5. // It stands for worst case delayed ACK timer (WCDelAckT). When FlightSize is // 1, PTO is inflated by WCDelAckT time to compensate for a potential long // delayed ACK timer at the receiver. const wcDelayedACKTimeout = 200 * time.Millisecond // RACK is a loss detection algorithm used in TCP to detect packet loss and // reordering using transmission timestamp of the packets instead of packet or // sequence counts. To use RACK, SACK should be enabled on the connection. // rackControl stores the rack related fields. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-6.1 // // +stateify savable type rackControl struct { // dsackSeen indicates if the connection has seen a DSACK. dsackSeen bool // endSequence is the ending TCP sequence number of rackControl.seg. endSequence seqnum.Value // fack is the highest selectively or cumulatively acknowledged // sequence. fack seqnum.Value // minRTT is the estimated minimum RTT of the connection. minRTT time.Duration // rtt is the RTT of the most recently delivered packet on the // connection (either cumulatively acknowledged or selectively // acknowledged) that was not marked invalid as a possible spurious // retransmission. rtt time.Duration // reorderSeen indicates if reordering has been detected on this // connection. reorderSeen bool // xmitTime is the latest transmission timestamp of rackControl.seg. xmitTime time.Time `state:".(unixTime)"` // probeTimer and probeWaker are used to schedule PTO for RACK TLP algorithm. probeTimer timer `state:"nosave"` probeWaker sleep.Waker `state:"nosave"` // tlpRxtOut indicates whether there is an unacknowledged // TLP retransmission. tlpRxtOut bool // tlpHighRxt the value of sender.sndNxt at the time of sending // a TLP retransmission. tlpHighRxt seqnum.Value } // init initializes RACK specific fields. func (rc *rackControl) init() { rc.probeTimer.init(&rc.probeWaker) } // update will update the RACK related fields when an ACK has been received. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 func (rc *rackControl) update(seg *segment, ackSeg *segment, offset uint32) { rtt := time.Now().Sub(seg.xmitTime) // If the ACK is for a retransmitted packet, do not update if it is a // spurious inference which is determined by below checks: // 1. When Timestamping option is available, if the TSVal is less than the // transmit time of the most recent retransmitted packet. // 2. When RTT calculated for the packet is less than the smoothed RTT // for the connection. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 // step 2 if seg.xmitCount > 1 { if ackSeg.parsedOptions.TS && ackSeg.parsedOptions.TSEcr != 0 { if ackSeg.parsedOptions.TSEcr < tcpTimeStamp(seg.xmitTime, offset) { return } } if rtt < rc.minRTT { return } } rc.rtt = rtt // The sender can either track a simple global minimum of all RTT // measurements from the connection, or a windowed min-filtered value // of recent RTT measurements. This implementation keeps track of the // simple global minimum of all RTTs for the connection. if rtt < rc.minRTT || rc.minRTT == 0 { rc.minRTT = rtt } // Update rc.xmitTime and rc.endSequence to the transmit time and // ending sequence number of the packet which has been acknowledged // most recently. endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) if rc.xmitTime.Before(seg.xmitTime) || (seg.xmitTime.Equal(rc.xmitTime) && rc.endSequence.LessThan(endSeq)) { rc.xmitTime = seg.xmitTime rc.endSequence = endSeq } } // detectReorder detects if packet reordering has been observed. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 // * Step 3: Detect data segment reordering. // To detect reordering, the sender looks for original data segments being // delivered out of order. To detect such cases, the sender tracks the // highest sequence selectively or cumulatively acknowledged in the RACK.fack // variable. The name "fack" stands for the most "Forward ACK" (this term is // adopted from [FACK]). If a never retransmitted segment that's below // RACK.fack is (selectively or cumulatively) acknowledged, it has been // delivered out of order. The sender sets RACK.reord to TRUE if such segment // is identified. func (rc *rackControl) detectReorder(seg *segment) { endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) if rc.fack.LessThan(endSeq) { rc.fack = endSeq return } if endSeq.LessThan(rc.fack) && seg.xmitCount == 1 { rc.reorderSeen = true } } // setDSACKSeen updates rack control if duplicate SACK is seen by the connection. func (rc *rackControl) setDSACKSeen() { rc.dsackSeen = true } // shouldSchedulePTO dictates whether we should schedule a PTO or not. // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. func (s *sender) shouldSchedulePTO() bool { // Schedule PTO only if RACK loss detection is enabled. return s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 && // The connection supports SACK. s.ep.sackPermitted && // The connection is not in loss recovery. (s.state != RTORecovery && s.state != SACKRecovery) && // The connection has no SACKed sequences in the SACK scoreboard. s.ep.scoreboard.Sacked() == 0 } // schedulePTO schedules the probe timeout as defined in // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. func (s *sender) schedulePTO() { pto := time.Second s.rtt.Lock() if s.rtt.srttInited && s.rtt.srtt > 0 { pto = s.rtt.srtt * 2 if s.outstanding == 1 { pto += wcDelayedACKTimeout } } s.rtt.Unlock() now := time.Now() if s.resendTimer.enabled() { if now.Add(pto).After(s.resendTimer.target) { pto = s.resendTimer.target.Sub(now) } s.resendTimer.disable() } s.rc.probeTimer.enable(pto) } // probeTimerExpired is the same as TLP_send_probe() as defined in // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.2. func (s *sender) probeTimerExpired() *tcpip.Error { if !s.rc.probeTimer.checkExpiration() { return nil } var dataSent bool if s.writeNext != nil && s.writeNext.xmitCount == 0 && s.outstanding < s.sndCwnd { dataSent = s.maybeSendSegment(s.writeNext, int(s.ep.scoreboard.SMSS()), s.sndUna.Add(s.sndWnd)) if dataSent { s.outstanding += s.pCount(s.writeNext, s.maxPayloadSize) s.writeNext = s.writeNext.Next() } } if !dataSent && !s.rc.tlpRxtOut { var highestSeqXmit *segment for highestSeqXmit = s.writeList.Front(); highestSeqXmit != nil; highestSeqXmit = highestSeqXmit.Next() { if highestSeqXmit.xmitCount == 0 { // Nothing in writeList is transmitted, no need to send a probe. highestSeqXmit = nil break } if highestSeqXmit.Next() == nil || highestSeqXmit.Next().xmitCount == 0 { // Either everything in writeList has been transmitted or the next // sequence has not been transmitted. Either way this is the highest // sequence segment that was transmitted. break } } if highestSeqXmit != nil { dataSent = s.maybeSendSegment(highestSeqXmit, int(s.ep.scoreboard.SMSS()), s.sndUna.Add(s.sndWnd)) if dataSent { s.rc.tlpRxtOut = true s.rc.tlpHighRxt = s.sndNxt } } } s.postXmit(dataSent) return nil } // detectTLPRecovery detects if recovery was accomplished by the loss probes // and updates TLP state accordingly. // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.3. func (s *sender) detectTLPRecovery(ack seqnum.Value, rcvdSeg *segment) { if !(s.ep.sackPermitted && s.rc.tlpRxtOut) { return } // Step 1. if s.isDupAck(rcvdSeg) && ack == s.rc.tlpHighRxt { var sbAboveTLPHighRxt bool for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { if s.rc.tlpHighRxt.LessThan(sb.End) { sbAboveTLPHighRxt = true break } } if !sbAboveTLPHighRxt { // TLP episode is complete. s.rc.tlpRxtOut = false } } if s.rc.tlpRxtOut && s.rc.tlpHighRxt.LessThanEq(ack) { // TLP episode is complete. s.rc.tlpRxtOut = false if !checkDSACK(rcvdSeg) { // Step 2. Either the original packet or the retransmission (in the // form of a probe) was lost. Invoke a congestion control response // equivalent to fast recovery. s.cc.HandleNDupAcks() s.enterRecovery() s.leaveRecovery() } } }