diff options
Diffstat (limited to 'pkg/tcpip/transport/tcp/snd.go')
-rw-r--r-- | pkg/tcpip/transport/tcp/snd.go | 512 |
1 files changed, 390 insertions, 122 deletions
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index d3f7c9125..c55589c45 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -15,12 +15,13 @@ package tcp import ( + "fmt" "math" - "sync" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -28,8 +29,11 @@ import ( ) const ( - // minRTO is the minimum allowed value for the retransmit timeout. - minRTO = 200 * time.Millisecond + // MinRTO is the minimum allowed value for the retransmit timeout. + MinRTO = 200 * time.Millisecond + + // MaxRTO is the maximum allowed value for the retransmit timeout. + MaxRTO = 120 * time.Second // InitialCwnd is the initial congestion window. InitialCwnd = 10 @@ -37,6 +41,11 @@ const ( // nDupAckThreshold is the number of duplicate ACK's required // before fast-retransmit is entered. nDupAckThreshold = 3 + + // MaxRetries is the maximum number of probe retries sender does + // before timing out the connection. + // Linux default TCP_RETR2, net.ipv4.tcp_retries2. + MaxRetries = 15 ) // ccState indicates the current congestion control state for this sender. @@ -123,10 +132,6 @@ type sender struct { // sndNxt is the sequence number of the next segment to be sent. sndNxt seqnum.Value - // sndNxtList is the sequence number of the next segment to be added to - // the send list. - sndNxtList seqnum.Value - // rttMeasureSeqNum is the sequence number being used for the latest RTT // measurement. rttMeasureSeqNum seqnum.Value @@ -134,6 +139,18 @@ type sender struct { // rttMeasureTime is the time when the rttMeasureSeqNum was sent. rttMeasureTime time.Time `state:".(unixTime)"` + // firstRetransmittedSegXmitTime is the original transmit time of + // the first segment that was retransmitted due to RTO expiration. + firstRetransmittedSegXmitTime time.Time `state:".(unixTime)"` + + // zeroWindowProbing is set if the sender is currently probing + // for zero receive window. + zeroWindowProbing bool `state:"nosave"` + + // unackZeroWindowProbes is the number of unacknowledged zero + // window probes. + unackZeroWindowProbes uint32 `state:"nosave"` + closed bool writeNext *segment writeList segmentList @@ -146,6 +163,15 @@ type sender struct { rtt rtt rto time.Duration + // minRTO is the minimum permitted value for sender.rto. + minRTO time.Duration + + // maxRTO is the maximum permitted value for sender.rto. + maxRTO time.Duration + + // maxRetries is the maximum permitted retransmissions. + maxRetries uint32 + // maxPayloadSize is the maximum size of the payload of a given segment. // It is initialized on demand. maxPayloadSize int @@ -165,6 +191,10 @@ type sender struct { // cc is the congestion control algorithm in use for this sender. cc congestionControl + + // rc has the fields needed for implementing RACK loss detection + // algorithm. + rc rackControl } // rtt is a synchronization wrapper used to appease stateify. See the comment @@ -222,7 +252,6 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint sndWnd: sndWnd, sndUna: iss + 1, sndNxt: iss + 1, - sndNxtList: iss + 1, rto: 1 * time.Second, rttMeasureSeqNum: iss + 1, lastSendTime: time.Now(), @@ -258,6 +287,25 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint // etc. s.ep.scoreboard = NewSACKScoreboard(uint16(s.maxPayloadSize), iss) + // Get Stack wide config. + var minRTO tcpip.TCPMinRTOOption + if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil { + panic(fmt.Sprintf("unable to get minRTO from stack: %s", err)) + } + s.minRTO = time.Duration(minRTO) + + var maxRTO tcpip.TCPMaxRTOOption + if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil { + panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err)) + } + s.maxRTO = time.Duration(maxRTO) + + var maxRetries tcpip.TCPMaxRetriesOption + if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil { + panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err)) + } + s.maxRetries = uint32(maxRetries) + return s } @@ -392,8 +440,8 @@ func (s *sender) updateRTO(rtt time.Duration) { s.rto = s.rtt.srtt + 4*s.rtt.rttvar s.rtt.Unlock() - if s.rto < minRTO { - s.rto = minRTO + if s.rto < s.minRTO { + s.rto = s.minRTO } } @@ -435,17 +483,56 @@ func (s *sender) retransmitTimerExpired() bool { return true } + // TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases + // when writeList is empty. Remove this once we have a proper fix for this + // issue. + if s.writeList.Front() == nil { + return true + } + s.ep.stack.Stats().TCP.Timeouts.Increment() s.ep.stats.SendErrors.Timeouts.Increment() - // Give up if we've waited more than a minute since the last resend. - if s.rto >= 60*time.Second { + // Give up if we've waited more than a minute since the last resend or + // if a user time out is set and we have exceeded the user specified + // timeout since the first retransmission. + uto := s.ep.userTimeout + + if s.firstRetransmittedSegXmitTime.IsZero() { + // We store the original xmitTime of the segment that we are + // about to retransmit as the retransmission time. This is + // required as by the time the retransmitTimer has expired the + // segment has already been sent and unacked for the RTO at the + // time the segment was sent. + s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime + } + + elapsed := time.Since(s.firstRetransmittedSegXmitTime) + remaining := s.maxRTO + if uto != 0 { + // Cap to the user specified timeout if one is specified. + remaining = uto - elapsed + } + + // Always honor the user-timeout irrespective of whether the zero + // window probes were acknowledged. + // net/ipv4/tcp_timer.c::tcp_probe_timer() + if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries { return false } // Set new timeout. The timer will be restarted by the call to sendData // below. s.rto *= 2 + // Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5 + if s.rto > s.maxRTO { + s.rto = s.maxRTO + } + + // Cap RTO to remaining time. + if s.rto > remaining { + s.rto = remaining + } // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4. // @@ -488,6 +575,26 @@ func (s *sender) retransmitTimerExpired() bool { // information is usable after an RTO. s.ep.scoreboard.Reset() s.writeNext = s.writeList.Front() + + // RFC 1122 4.2.2.17: Start sending zero window probes when we still see a + // zero receive window after retransmission interval and we have data to + // send. + if s.zeroWindowProbing { + s.sendZeroWindowProbe() + // RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed + // indefinitely. As long as the receiving TCP continues to send + // acknowledgments in response to the probe segments, the sending TCP + // MUST allow the connection to stay open. + return true + } + + seg := s.writeNext + // RFC 1122 4.2.3.5: Close the connection when the number of + // retransmissions for this segment is beyond a limit. + if seg != nil && seg.xmitCount > s.maxRetries { + return false + } + s.sendData() return true @@ -515,25 +622,51 @@ func (s *sender) splitSeg(seg *segment, size int) { nSeg.data.TrimFront(size) nSeg.sequenceNumber.UpdateForward(seqnum.Size(size)) s.writeList.InsertAfter(seg, nSeg) + + // The segment being split does not carry PUSH flag because it is + // followed by the newly split segment. + // RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered + // segment (i.e., when there is no more queued data to be sent). + // Linux removes PSH flag only when the segment is being split over MSS + // and retains it when we are splitting the segment over lack of sender + // window space. + // ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point() + // ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test() + if seg.data.Size() > s.maxPayloadSize { + seg.flags ^= header.TCPFlagPsh + } + seg.data.CapLength(size) } -// NextSeg implements the RFC6675 NextSeg() operation. It returns segments that -// match rule 1, 3 and 4 of the NextSeg() operation defined in RFC6675. Rule 2 -// is handled by the normal send logic. -func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) { +// NextSeg implements the RFC6675 NextSeg() operation. +// +// NextSeg starts scanning the writeList starting from nextSegHint and returns +// the hint to be passed on the next call to NextSeg. This is required to avoid +// iterating the write list repeatedly when NextSeg is invoked in a loop during +// recovery. The returned hint will be nil if there are no more segments that +// can match rules defined by NextSeg operation in RFC6675. +// +// rescueRtx will be true only if nextSeg is a rescue retransmission as +// described by Step 4) of the NextSeg algorithm. +func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) { var s3 *segment var s4 *segment - smss := s.ep.scoreboard.SMSS() // Step 1. - for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { - if !s.isAssignedSequenceNumber(seg) { + for seg := nextSegHint; seg != nil; seg = seg.Next() { + // Stop iteration if we hit a segment that has never been + // transmitted (i.e. either it has no assigned sequence number + // or if it does have one, it's >= the next sequence number + // to be sent [i.e. >= s.sndNxt]). + if !s.isAssignedSequenceNumber(seg) || s.sndNxt.LessThanEq(seg.sequenceNumber) { + hint = nil break } segSeq := seg.sequenceNumber - if seg.data.Size() > int(smss) { + if smss := s.ep.scoreboard.SMSS(); seg.data.Size() > int(smss) { s.splitSeg(seg, int(smss)) } + // See RFC 6675 Section 4 // // 1. If there exists a smallest unSACKED sequence number @@ -550,8 +683,9 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) { // NextSeg(): // (1.c) IsLost(S2) returns true. if s.ep.scoreboard.IsLost(segSeq) { - return seg, s3, s4 + return seg, seg.Next(), false } + // NextSeg(): // // (3): If the conditions for rules (1) and (2) @@ -563,6 +697,7 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) { // SHOULD be returned. if s3 == nil { s3 = seg + hint = seg.Next() } } // NextSeg(): @@ -571,10 +706,12 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) { // but there exists outstanding unSACKED data, we // provide the opportunity for a single "rescue" // retransmission per entry into loss recovery. If - // HighACK is greater than RescueRxt, the one - // segment of upto SMSS octects that MUST include - // the highest outstanding unSACKed sequence number - // SHOULD be returned. + // HighACK is greater than RescueRxt (or RescueRxt + // is undefined), then one segment of upto SMSS + // octects that MUST include the highest outstanding + // unSACKed sequence number SHOULD be returned, and + // RescueRxt set to RecoveryPoint. HighRxt MUST NOT + // be updated. if s.fr.rescueRxt.LessThan(s.sndUna - 1) { if s4 != nil { if s4.sequenceNumber.LessThan(segSeq) { @@ -583,12 +720,31 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) { } else { s4 = seg } - s.fr.rescueRxt = s.fr.last } } } - return nil, s3, s4 + // If we got here then no segment matched step (1). + // Step (2): "If no sequence number 'S2' per rule (1) + // exists but there exists available unsent data and the + // receiver's advertised window allows, the sequence + // range of one segment of up to SMSS octets of + // previously unsent data starting with sequence number + // HighData+1 MUST be returned." + for seg := s.writeNext; seg != nil; seg = seg.Next() { + if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) { + continue + } + // We do not split the segment here to <= smss as it has + // potentially not been assigned a sequence number yet. + return seg, nil, false + } + + if s3 != nil { + return s3, hint, false + } + + return s4, nil, true } // maybeSendSegment tries to send the specified segment and either coalesces @@ -601,7 +757,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se if !s.isAssignedSequenceNumber(seg) { // Merge segments if allowed. if seg.data.Size() != 0 { - available := int(seg.sequenceNumber.Size(end)) + available := int(s.sndNxt.Size(end)) if available > limit { available = limit } @@ -644,8 +800,11 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se // sent all at once. return false } - if atomic.LoadUint32(&s.ep.cork) != 0 { - // Hold back the segment until full. + // With TCP_CORK, hold back until minimum of the available + // send space and MSS. + // TODO(gvisor.dev/issue/2833): Drain the held segments after a + // timeout. + if seg.data.Size() < s.maxPayloadSize && atomic.LoadUint32(&s.ep.cork) != 0 { return false } } @@ -664,18 +823,14 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se } seg.flags = header.TCPFlagAck | header.TCPFlagFin segEnd = seg.sequenceNumber.Add(1) - // Transition to FIN-WAIT1 state since we're initiating an active close. - s.ep.mu.Lock() - switch s.ep.state { + // Update the state to reflect that we have now + // queued a FIN. + switch s.ep.EndpointState() { case StateCloseWait: - // We've already received a FIN and are now sending our own. The - // sender is now awaiting a final ACK for this FIN. - s.ep.state = StateLastAck + s.ep.setEndpointState(StateLastAck) default: - s.ep.state = StateFinWait1 + s.ep.setEndpointState(StateFinWait1) } - s.ep.stack.Stats().TCP.CurrentEstablished.Decrement() - s.ep.mu.Unlock() } else { // We're sending a non-FIN segment. if seg.flags&header.TCPFlagFin != 0 { @@ -690,10 +845,52 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se if available == 0 { return false } + + // If the whole segment or at least 1MSS sized segment cannot + // be accomodated in the receiver advertized window, skip + // splitting and sending of the segment. ref: + // net/ipv4/tcp_output.c::tcp_snd_wnd_test() + // + // Linux checks this for all segment transmits not triggered by + // a probe timer. On this condition, it defers the segment split + // and transmit to a short probe timer. + // + // ref: include/net/tcp.h::tcp_check_probe_timer() + // ref: net/ipv4/tcp_output.c::tcp_write_wakeup() + // + // Instead of defining a new transmit timer, we attempt to split + // the segment right here if there are no pending segments. If + // there are pending segments, segment transmits are deferred to + // the retransmit timer handler. + if s.sndUna != s.sndNxt { + switch { + case available >= seg.data.Size(): + // OK to send, the whole segments fits in the + // receiver's advertised window. + case available >= s.maxPayloadSize: + // OK to send, at least 1 MSS sized segment fits + // in the receiver's advertised window. + default: + return false + } + } + + // The segment size limit is computed as a function of sender + // congestion window and MSS. When sender congestion window is > + // 1, this limit can be larger than MSS. Ensure that the + // currently available send space is not greater than minimum of + // this limit and MSS. if available > limit { available = limit } + // If GSO is not in use then cap available to + // maxPayloadSize. When GSO is in use the gVisor GSO logic or + // the host GSO logic will cap the segment to the correct size. + if s.ep.gso == nil && available > s.maxPayloadSize { + available = s.maxPayloadSize + } + if seg.data.Size() > available { s.splitSeg(seg, available) } @@ -716,64 +913,47 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se // section 5, step C. func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) { s.SetPipe() + + if smss := int(s.ep.scoreboard.SMSS()); limit > smss { + // Cap segment size limit to s.smss as SACK recovery requires + // that all retransmissions or new segments send during recovery + // be of <= SMSS. + limit = smss + } + + nextSegHint := s.writeList.Front() for s.outstanding < s.sndCwnd { - nextSeg, s3, s4 := s.NextSeg() + var nextSeg *segment + var rescueRtx bool + nextSeg, nextSegHint, rescueRtx = s.NextSeg(nextSegHint) if nextSeg == nil { - // NextSeg(): - // - // Step (2): "If no sequence number 'S2' per rule (1) - // exists but there exists available unsent data and the - // receiver's advertised window allows, the sequence - // range of one segment of up to SMSS octets of - // previously unsent data starting with sequence number - // HighData+1 MUST be returned." - for seg := s.writeNext; seg != nil; seg = seg.Next() { - if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) { - continue - } - // Step C.3 described below is handled by - // maybeSendSegment which increments sndNxt when - // a segment is transmitted. - // - // Step C.3 "If any of the data octets sent in - // (C.1) are above HighData, HighData must be - // updated to reflect the transmission of - // previously unsent data." - if sent := s.maybeSendSegment(seg, limit, end); !sent { - break - } - dataSent = true - s.outstanding++ - s.writeNext = seg.Next() - nextSeg = seg - break - } - if nextSeg != nil { - continue - } - } - rescueRtx := false - if nextSeg == nil && s3 != nil { - nextSeg = s3 - } - if nextSeg == nil && s4 != nil { - nextSeg = s4 - rescueRtx = true + return dataSent } - if nextSeg == nil { - break - } - segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen()) - if !rescueRtx && nextSeg.sequenceNumber.LessThan(s.sndNxt) { - // RFC 6675, Step C.2 + if !s.isAssignedSequenceNumber(nextSeg) || s.sndNxt.LessThanEq(nextSeg.sequenceNumber) { + // New data being sent. + + // Step C.3 described below is handled by + // maybeSendSegment which increments sndNxt when + // a segment is transmitted. // - // "If any of the data octets sent in (C.1) are below - // HighData, HighRxt MUST be set to the highest sequence - // number of the retransmitted segment unless NextSeg () - // rule (4) was invoked for this retransmission." - s.fr.highRxt = segEnd - 1 + // Step C.3 "If any of the data octets sent in + // (C.1) are above HighData, HighData must be + // updated to reflect the transmission of + // previously unsent data." + // + // We pass s.smss as the limit as the Step 2) requires that + // new data sent should be of size s.smss or less. + if sent := s.maybeSendSegment(nextSeg, limit, end); !sent { + return dataSent + } + dataSent = true + s.outstanding++ + s.writeNext = nextSeg.Next() + continue } + // Now handle the retransmission case where we matched either step 1,3 or 4 + // of the NextSeg algorithm. // RFC 6675, Step C.4. // // "The estimate of the amount of data outstanding in the network @@ -782,10 +962,54 @@ func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) s.outstanding++ dataSent = true s.sendSegment(nextSeg) + + segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen()) + if rescueRtx { + // We do the last part of rule (4) of NextSeg here to update + // RescueRxt as until this point we don't know if we are going + // to use the rescue transmission. + s.fr.rescueRxt = s.fr.last + } else { + // RFC 6675, Step C.2 + // + // "If any of the data octets sent in (C.1) are below + // HighData, HighRxt MUST be set to the highest sequence + // number of the retransmitted segment unless NextSeg () + // rule (4) was invoked for this retransmission." + s.fr.highRxt = segEnd - 1 + } } return dataSent } +func (s *sender) sendZeroWindowProbe() { + ack, win := s.ep.rcv.getSendParams() + s.unackZeroWindowProbes++ + // Send a zero window probe with sequence number pointing to + // the last acknowledged byte. + s.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, s.sndUna-1, ack, win) + // Rearm the timer to continue probing. + s.resendTimer.enable(s.rto) +} + +func (s *sender) enableZeroWindowProbing() { + s.zeroWindowProbing = true + // We piggyback the probing on the retransmit timer with the + // current retranmission interval, as we may start probing while + // segment retransmissions. + if s.firstRetransmittedSegXmitTime.IsZero() { + s.firstRetransmittedSegXmitTime = time.Now() + } + s.resendTimer.enable(s.rto) +} + +func (s *sender) disableZeroWindowProbing() { + s.zeroWindowProbing = false + s.unackZeroWindowProbes = 0 + s.firstRetransmittedSegXmitTime = time.Time{} + s.resendTimer.disable() +} + // sendData sends new data segments. It is called when data becomes available or // when the send window opens up. func (s *sender) sendData() { @@ -799,7 +1023,7 @@ func (s *sender) sendData() { // "A TCP SHOULD set cwnd to no more than RW before beginning // transmission if the TCP has not sent data in the interval exceeding // the retrasmission timeout." - if !s.fr.active && time.Now().Sub(s.lastSendTime) > s.rto { + if !s.fr.active && s.state != RTORecovery && time.Now().Sub(s.lastSendTime) > s.rto { if s.sndCwnd > InitialCwnd { s.sndCwnd = InitialCwnd } @@ -817,6 +1041,9 @@ func (s *sender) sendData() { limit = cwndLimit } if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { + // Move writeNext along so that we don't try and scan data that + // has already been SACKED. + s.writeNext = seg.Next() continue } if sent := s.maybeSendSegment(seg, limit, end); !sent { @@ -834,6 +1061,13 @@ func (s *sender) sendData() { s.ep.disableKeepaliveTimer() } + // If the sender has advertized zero receive window and we have + // data to be sent out, start zero window probing to query the + // the remote for it's receive window size. + if s.writeNext != nil && s.sndWnd == 0 { + s.enableZeroWindowProbing() + } + // Enable the timer if we have pending data and it's not enabled yet. if !s.resendTimer.enabled() && s.sndUna != s.sndNxt { s.resendTimer.enable(s.rto) @@ -855,6 +1089,8 @@ func (s *sender) enterFastRecovery() { s.fr.first = s.sndUna s.fr.last = s.sndNxt - 1 s.fr.maxCwnd = s.sndCwnd + s.outstanding + s.fr.highRxt = s.sndUna + s.fr.rescueRxt = s.sndUna if s.ep.sackPermitted { s.state = SACKRecovery s.ep.stack.Stats().TCP.SACKRecovery.Increment() @@ -1040,21 +1276,21 @@ func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) { // handleRcvdSegment is called when a segment is received; it is responsible for // updating the send-related state. -func (s *sender) handleRcvdSegment(seg *segment) { +func (s *sender) handleRcvdSegment(rcvdSeg *segment) { // Check if we can extract an RTT measurement from this ack. - if !seg.parsedOptions.TS && s.rttMeasureSeqNum.LessThan(seg.ackNumber) { + if !rcvdSeg.parsedOptions.TS && s.rttMeasureSeqNum.LessThan(rcvdSeg.ackNumber) { s.updateRTO(time.Now().Sub(s.rttMeasureTime)) s.rttMeasureSeqNum = s.sndNxt } // Update Timestamp if required. See RFC7323, section-4.3. - if s.ep.sendTSOk && seg.parsedOptions.TS { - s.ep.updateRecentTimestamp(seg.parsedOptions.TSVal, s.maxSentAck, seg.sequenceNumber) + if s.ep.sendTSOk && rcvdSeg.parsedOptions.TS { + s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.maxSentAck, rcvdSeg.sequenceNumber) } // Insert SACKBlock information into our scoreboard. if s.ep.sackPermitted { - for _, sb := range seg.parsedOptions.SACKBlocks { + for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { // Only insert the SACK block if the following holds // true: // * SACK block acks data after the ack number in the @@ -1067,22 +1303,40 @@ func (s *sender) handleRcvdSegment(seg *segment) { // NOTE: This check specifically excludes DSACK blocks // which have start/end before sndUna and are used to // indicate spurious retransmissions. - if seg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) { + if rcvdSeg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) { s.ep.scoreboard.Insert(sb) - seg.hasNewSACKInfo = true + rcvdSeg.hasNewSACKInfo = true } } s.SetPipe() } // Count the duplicates and do the fast retransmit if needed. - rtx := s.checkDuplicateAck(seg) + rtx := s.checkDuplicateAck(rcvdSeg) // Stash away the current window size. - s.sndWnd = seg.window + s.sndWnd = rcvdSeg.window + + ack := rcvdSeg.ackNumber + + // Disable zero window probing if remote advertizes a non-zero receive + // window. This can be with an ACK to the zero window probe (where the + // acknumber refers to the already acknowledged byte) OR to any previously + // unacknowledged segment. + if s.zeroWindowProbing && rcvdSeg.window > 0 && + (ack == s.sndUna || (ack-1).InRange(s.sndUna, s.sndNxt)) { + s.disableZeroWindowProbing() + } + + // On receiving the ACK for the zero window probe, account for it and + // skip trying to send any segment as we are still probing for + // receive window to become non-zero. + if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.sndUna { + s.unackZeroWindowProbes-- + return + } // Ignore ack if it doesn't acknowledge any new data. - ack := seg.ackNumber if (ack - 1).InRange(s.sndUna, s.sndNxt) { s.dupAckCount = 0 @@ -1094,15 +1348,15 @@ func (s *sender) handleRcvdSegment(seg *segment) { // averaged RTT measurement only if the segment acknowledges // some new data, i.e., only if it advances the left edge of // the send window. - if s.ep.sendTSOk && seg.parsedOptions.TSEcr != 0 { + if s.ep.sendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 { // TSVal/Ecr values sent by Netstack are at a millisecond // granularity. - elapsed := time.Duration(s.ep.timestamp()-seg.parsedOptions.TSEcr) * time.Millisecond + elapsed := time.Duration(s.ep.timestamp()-rcvdSeg.parsedOptions.TSEcr) * time.Millisecond s.updateRTO(elapsed) } // When an ack is received we must rearm the timer. - // RFC 6298 5.2 + // RFC 6298 5.3 s.resendTimer.enable(s.rto) // Remove all acknowledged data from the write list. @@ -1111,6 +1365,9 @@ func (s *sender) handleRcvdSegment(seg *segment) { ackLeft := acked originalOutstanding := s.outstanding + s.rtt.Lock() + srtt := s.rtt.srtt + s.rtt.Unlock() for ackLeft > 0 { // We use logicalLen here because we can have FIN // segments (which are always at the end of list) that @@ -1129,6 +1386,12 @@ func (s *sender) handleRcvdSegment(seg *segment) { if s.writeNext == seg { s.writeNext = seg.Next() } + + // Update the RACK fields if SACK is enabled. + if s.ep.sackPermitted { + s.rc.Update(seg, rcvdSeg, srtt, s.ep.tsOffset) + } + s.writeList.Remove(seg) // if SACK is enabled then Only reduce outstanding if @@ -1169,6 +1432,8 @@ func (s *sender) handleRcvdSegment(seg *segment) { // RFC 6298 Rule 5.3 if s.sndUna == s.sndNxt { s.outstanding = 0 + // Reset firstRetransmittedSegXmitTime to the zero value. + s.firstRetransmittedSegXmitTime = time.Time{} s.resendTimer.disable() } } @@ -1182,14 +1447,14 @@ func (s *sender) handleRcvdSegment(seg *segment) { // that the window opened up, or the congestion window was inflated due // to a duplicate ack during fast recovery. This will also re-enable // the retransmit timer if needed. - if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || seg.hasNewSACKInfo { + if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || rcvdSeg.hasNewSACKInfo { s.sendData() } } // sendSegment sends the specified segment. func (s *sender) sendSegment(seg *segment) *tcpip.Error { - if !seg.xmitTime.IsZero() { + if seg.xmitCount > 0 { s.ep.stack.Stats().TCP.Retransmits.Increment() s.ep.stats.SendErrors.Retransmits.Increment() if s.sndCwnd < s.sndSsthresh { @@ -1197,7 +1462,24 @@ func (s *sender) sendSegment(seg *segment) *tcpip.Error { } } seg.xmitTime = time.Now() - return s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber) + seg.xmitCount++ + err := s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber) + + // Every time a packet containing data is sent (including a + // retransmission), if SACK is enabled and we are retransmitting data + // then use the conservative timer described in RFC6675 Section 6.0, + // otherwise follow the standard time described in RFC6298 Section 5.1. + if err != nil && seg.data.Size() != 0 { + if s.fr.active && seg.xmitCount > 1 && s.ep.sackPermitted { + s.resendTimer.enable(s.rto) + } else { + if !s.resendTimer.enabled() { + s.resendTimer.enable(s.rto) + } + } + } + + return err } // sendSegmentFromView sends a new segment containing the given payload, flags @@ -1213,19 +1495,5 @@ func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags byte, seq // Remember the max sent ack. s.maxSentAck = rcvNxt - // Every time a packet containing data is sent (including a - // retransmission), if SACK is enabled then use the conservative timer - // described in RFC6675 Section 4.0, otherwise follow the standard time - // described in RFC6298 Section 5.2. - if data.Size() != 0 { - if s.ep.sackPermitted { - s.resendTimer.enable(s.rto) - } else { - if !s.resendTimer.enabled() { - s.resendTimer.enable(s.rto) - } - } - } - return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd) } |