1 files changed, 390 insertions, 122 deletions
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index d3f7c9125..c55589c45 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -15,12 +15,13 @@
 package tcp
 
 import (
+	"fmt"
 	"math"
-	"sync"
 	"sync/atomic"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -28,8 +29,11 @@ import (
 )
 
 const (
-	// minRTO is the minimum allowed value for the retransmit timeout.
-	minRTO = 200 * time.Millisecond
+	// MinRTO is the minimum allowed value for the retransmit timeout.
+	MinRTO = 200 * time.Millisecond
+
+	// MaxRTO is the maximum allowed value for the retransmit timeout.
+	MaxRTO = 120 * time.Second
 
 	// InitialCwnd is the initial congestion window.
 	InitialCwnd = 10
@@ -37,6 +41,11 @@ const (
 	// nDupAckThreshold is the number of duplicate ACK's required
 	// before fast-retransmit is entered.
 	nDupAckThreshold = 3
+
+	// MaxRetries is the maximum number of probe retries sender does
+	// before timing out the connection.
+	// Linux default TCP_RETR2, net.ipv4.tcp_retries2.
+	MaxRetries = 15
 )
 
 // ccState indicates the current congestion control state for this sender.
@@ -123,10 +132,6 @@ type sender struct {
 	// sndNxt is the sequence number of the next segment to be sent.
 	sndNxt seqnum.Value
 
-	// sndNxtList is the sequence number of the next segment to be added to
-	// the send list.
-	sndNxtList seqnum.Value
-
 	// rttMeasureSeqNum is the sequence number being used for the latest RTT
 	// measurement.
 	rttMeasureSeqNum seqnum.Value
@@ -134,6 +139,18 @@ type sender struct {
 	// rttMeasureTime is the time when the rttMeasureSeqNum was sent.
 	rttMeasureTime time.Time `state:".(unixTime)"`
 
+	// firstRetransmittedSegXmitTime is the original transmit time of
+	// the first segment that was retransmitted due to RTO expiration.
+	firstRetransmittedSegXmitTime time.Time `state:".(unixTime)"`
+
+	// zeroWindowProbing is set if the sender is currently probing
+	// for zero receive window.
+	zeroWindowProbing bool `state:"nosave"`
+
+	// unackZeroWindowProbes is the number of unacknowledged zero
+	// window probes.
+	unackZeroWindowProbes uint32 `state:"nosave"`
+
 	closed      bool
 	writeNext   *segment
 	writeList   segmentList
@@ -146,6 +163,15 @@ type sender struct {
 	rtt rtt
 	rto time.Duration
 
+	// minRTO is the minimum permitted value for sender.rto.
+	minRTO time.Duration
+
+	// maxRTO is the maximum permitted value for sender.rto.
+	maxRTO time.Duration
+
+	// maxRetries is the maximum permitted retransmissions.
+	maxRetries uint32
+
 	// maxPayloadSize is the maximum size of the payload of a given segment.
 	// It is initialized on demand.
 	maxPayloadSize int
@@ -165,6 +191,10 @@ type sender struct {
 
 	// cc is the congestion control algorithm in use for this sender.
 	cc congestionControl
+
+	// rc has the fields needed for implementing RACK loss detection
+	// algorithm.
+	rc rackControl
 }
 
 // rtt is a synchronization wrapper used to appease stateify. See the comment
@@ -222,7 +252,6 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
 		sndWnd:           sndWnd,
 		sndUna:           iss + 1,
 		sndNxt:           iss + 1,
-		sndNxtList:       iss + 1,
 		rto:              1 * time.Second,
 		rttMeasureSeqNum: iss + 1,
 		lastSendTime:     time.Now(),
@@ -258,6 +287,25 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
 	// etc.
 	s.ep.scoreboard = NewSACKScoreboard(uint16(s.maxPayloadSize), iss)
 
+	// Get Stack wide config.
+	var minRTO tcpip.TCPMinRTOOption
+	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil {
+		panic(fmt.Sprintf("unable to get minRTO from stack: %s", err))
+	}
+	s.minRTO = time.Duration(minRTO)
+
+	var maxRTO tcpip.TCPMaxRTOOption
+	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil {
+		panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err))
+	}
+	s.maxRTO = time.Duration(maxRTO)
+
+	var maxRetries tcpip.TCPMaxRetriesOption
+	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil {
+		panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err))
+	}
+	s.maxRetries = uint32(maxRetries)
+
 	return s
 }
 
@@ -392,8 +440,8 @@ func (s *sender) updateRTO(rtt time.Duration) {
 
 	s.rto = s.rtt.srtt + 4*s.rtt.rttvar
 	s.rtt.Unlock()
-	if s.rto < minRTO {
-		s.rto = minRTO
+	if s.rto < s.minRTO {
+		s.rto = s.minRTO
 	}
 }
 
@@ -435,17 +483,56 @@ func (s *sender) retransmitTimerExpired() bool {
 		return true
 	}
 
+	// TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
+	// when writeList is empty. Remove this once we have a proper fix for this
+	// issue.
+	if s.writeList.Front() == nil {
+		return true
+	}
+
 	s.ep.stack.Stats().TCP.Timeouts.Increment()
 	s.ep.stats.SendErrors.Timeouts.Increment()
 
-	// Give up if we've waited more than a minute since the last resend.
-	if s.rto >= 60*time.Second {
+	// Give up if a user timeout is set and the time elapsed since the first
+	// retransmitted segment was originally sent exceeds it, or if too many
+	// zero window probes are outstanding (see the check below).
+	uto := s.ep.userTimeout
+
+	if s.firstRetransmittedSegXmitTime.IsZero() {
+		// We store the original xmitTime of the segment that we are
+		// about to retransmit as the retransmission time. This is
+		// required as by the time the retransmitTimer has expired the
+		// segment has already been sent and unacked for the RTO at the
+		// time the segment was sent.
+		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
+	}
+
+	elapsed := time.Since(s.firstRetransmittedSegXmitTime)
+	remaining := s.maxRTO
+	if uto != 0 {
+		// Cap to the user specified timeout if one is specified.
+		remaining = uto - elapsed
+	}
+
+	// Always honor the user-timeout irrespective of whether the zero
+	// window probes were acknowledged.
+	// net/ipv4/tcp_timer.c::tcp_probe_timer()
+	if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries {
 		return false
 	}
 
 	// Set new timeout. The timer will be restarted by the call to sendData
 	// below.
 	s.rto *= 2
+	// Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5
+	if s.rto > s.maxRTO {
+		s.rto = s.maxRTO
+	}
+
+	// Cap RTO to remaining time.
+	if s.rto > remaining {
+		s.rto = remaining
+	}
 
 	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
 	//
@@ -488,6 +575,26 @@ func (s *sender) retransmitTimerExpired() bool {
 	// information is usable after an RTO.
 	s.ep.scoreboard.Reset()
 	s.writeNext = s.writeList.Front()
+
+	// RFC 1122 4.2.2.17: Start sending zero window probes when we still see a
+	// zero receive window after retransmission interval and we have data to
+	// send.
+	if s.zeroWindowProbing {
+		s.sendZeroWindowProbe()
+		// RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed
+		// indefinitely.  As long as the receiving TCP continues to send
+		// acknowledgments in response to the probe segments, the sending TCP
+		// MUST allow the connection to stay open.
+		return true
+	}
+
+	seg := s.writeNext
+	// RFC 1122 4.2.3.5: Close the connection when the number of
+	// retransmissions for this segment is beyond a limit.
+	if seg != nil && seg.xmitCount > s.maxRetries {
+		return false
+	}
+
 	s.sendData()
 
 	return true
@@ -515,25 +622,51 @@ func (s *sender) splitSeg(seg *segment, size int) {
 	nSeg.data.TrimFront(size)
 	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
 	s.writeList.InsertAfter(seg, nSeg)
+
+	// The segment being split does not carry PUSH flag because it is
+	// followed by the newly split segment. Clear the bit (rather than
+	// toggling it) so a segment without PSH never acquires it here.
+	// RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered
+	// segment (i.e., when there is no more queued data to be sent).
+	// Linux removes PSH flag only when the segment is being split over MSS
+	// and retains it when we are splitting over lack of sender window space.
+	// ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point()
+	// ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test()
+	if seg.data.Size() > s.maxPayloadSize {
+		seg.flags &^= header.TCPFlagPsh
+	}
+
 	seg.data.CapLength(size)
 }
 
-// NextSeg implements the RFC6675 NextSeg() operation. It returns segments that
-// match rule 1, 3 and 4 of the NextSeg() operation defined in RFC6675. Rule 2
-// is handled by the normal send logic.
-func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) {
+// NextSeg implements the RFC6675 NextSeg() operation.
+//
+// NextSeg starts scanning the writeList starting from nextSegHint and returns
+// the hint to be passed on the next call to NextSeg. This is required to avoid
+// iterating the write list repeatedly when NextSeg is invoked in a loop during
+// recovery. The returned hint will be nil if there are no more segments that
+// can match rules defined by NextSeg operation in RFC6675.
+//
+// rescueRtx will be true only if nextSeg is a rescue retransmission as
+// described by Step 4) of the NextSeg algorithm.
+func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
 	var s3 *segment
 	var s4 *segment
-	smss := s.ep.scoreboard.SMSS()
 	// Step 1.
-	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
-		if !s.isAssignedSequenceNumber(seg) {
+	for seg := nextSegHint; seg != nil; seg = seg.Next() {
+		// Stop iteration if we hit a segment that has never been
+		// transmitted (i.e. either it has no assigned sequence number
+		// or if it does have one, it's >= the next sequence number
+		// to be sent [i.e. >= s.sndNxt]).
+		if !s.isAssignedSequenceNumber(seg) || s.sndNxt.LessThanEq(seg.sequenceNumber) {
+			hint = nil
 			break
 		}
 		segSeq := seg.sequenceNumber
-		if seg.data.Size() > int(smss) {
+		if smss := s.ep.scoreboard.SMSS(); seg.data.Size() > int(smss) {
 			s.splitSeg(seg, int(smss))
 		}
+
 		// See RFC 6675 Section 4
 		//
 		//     1. If there exists a smallest unSACKED sequence number
@@ -550,8 +683,9 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) {
 				// NextSeg():
 				//     (1.c) IsLost(S2) returns true.
 				if s.ep.scoreboard.IsLost(segSeq) {
-					return seg, s3, s4
+					return seg, seg.Next(), false
 				}
+
 				// NextSeg():
 				//
 				// (3): If the conditions for rules (1) and (2)
@@ -563,6 +697,7 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) {
 				// SHOULD be returned.
 				if s3 == nil {
 					s3 = seg
+					hint = seg.Next()
 				}
 			}
 			// NextSeg():
@@ -571,10 +706,12 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) {
 			//     but there exists outstanding unSACKED data, we
 			//     provide the opportunity for a single "rescue"
 			//     retransmission per entry into loss recovery. If
-			//     HighACK is greater than RescueRxt, the one
-			//     segment of upto SMSS octects that MUST include
-			//     the highest outstanding unSACKed sequence number
-			//     SHOULD be returned.
+			//     HighACK is greater than RescueRxt (or RescueRxt
+			//     is undefined), then one segment of up to SMSS
+			//     octets that MUST include the highest outstanding
+			//     unSACKed sequence number SHOULD be returned, and
+			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
+			//     be updated.
 			if s.fr.rescueRxt.LessThan(s.sndUna - 1) {
 				if s4 != nil {
 					if s4.sequenceNumber.LessThan(segSeq) {
@@ -583,12 +720,31 @@ func (s *sender) NextSeg() (nextSeg1, nextSeg3, nextSeg4 *segment) {
 				} else {
 					s4 = seg
 				}
-				s.fr.rescueRxt = s.fr.last
 			}
 		}
 	}
 
-	return nil, s3, s4
+	// If we got here then no segment matched step (1).
+	// Step (2): "If no sequence number 'S2' per rule (1)
+	// exists but there exists available unsent data and the
+	// receiver's advertised window allows, the sequence
+	// range of one segment of up to SMSS octets of
+	// previously unsent data starting with sequence number
+	// HighData+1 MUST be returned."
+	for seg := s.writeNext; seg != nil; seg = seg.Next() {
+		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) {
+			continue
+		}
+		// We do not split the segment here to <= smss as it has
+		// potentially not been assigned a sequence number yet.
+		return seg, nil, false
+	}
+
+	if s3 != nil {
+		return s3, hint, false
+	}
+
+	return s4, nil, true
 }
 
 // maybeSendSegment tries to send the specified segment and either coalesces
@@ -601,7 +757,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 	if !s.isAssignedSequenceNumber(seg) {
 		// Merge segments if allowed.
 		if seg.data.Size() != 0 {
-			available := int(seg.sequenceNumber.Size(end))
+			available := int(s.sndNxt.Size(end))
 			if available > limit {
 				available = limit
 			}
@@ -644,8 +800,11 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 					//   sent all at once.
 					return false
 				}
-				if atomic.LoadUint32(&s.ep.cork) != 0 {
-					// Hold back the segment until full.
+				// With TCP_CORK, hold back until minimum of the available
+				// send space and MSS.
+				// TODO(gvisor.dev/issue/2833): Drain the held segments after a
+				// timeout.
+				if seg.data.Size() < s.maxPayloadSize && atomic.LoadUint32(&s.ep.cork) != 0 {
 					return false
 				}
 			}
@@ -664,18 +823,14 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 		}
 		seg.flags = header.TCPFlagAck | header.TCPFlagFin
 		segEnd = seg.sequenceNumber.Add(1)
-		// Transition to FIN-WAIT1 state since we're initiating an active close.
-		s.ep.mu.Lock()
-		switch s.ep.state {
+		// Update the state to reflect that we have now
+		// queued a FIN.
+		switch s.ep.EndpointState() {
 		case StateCloseWait:
-			// We've already received a FIN and are now sending our own. The
-			// sender is now awaiting a final ACK for this FIN.
-			s.ep.state = StateLastAck
+			s.ep.setEndpointState(StateLastAck)
 		default:
-			s.ep.state = StateFinWait1
+			s.ep.setEndpointState(StateFinWait1)
 		}
-		s.ep.stack.Stats().TCP.CurrentEstablished.Decrement()
-		s.ep.mu.Unlock()
 	} else {
 		// We're sending a non-FIN segment.
 		if seg.flags&header.TCPFlagFin != 0 {
@@ -690,10 +845,52 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 		if available == 0 {
 			return false
 		}
+
+		// If the whole segment or at least 1MSS sized segment cannot
+		// be accommodated in the receiver advertised window, skip
+		// splitting and sending of the segment. ref:
+		// net/ipv4/tcp_output.c::tcp_snd_wnd_test()
+		//
+		// Linux checks this for all segment transmits not triggered by
+		// a probe timer. On this condition, it defers the segment split
+		// and transmit to a short probe timer.
+		//
+		// ref: include/net/tcp.h::tcp_check_probe_timer()
+		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
+		//
+		// Instead of defining a new transmit timer, we attempt to split
+		// the segment right here if there are no pending segments. If
+		// there are pending segments, segment transmits are deferred to
+		// the retransmit timer handler.
+		if s.sndUna != s.sndNxt {
+			switch {
+			case available >= seg.data.Size():
+				// OK to send, the whole segments fits in the
+				// receiver's advertised window.
+			case available >= s.maxPayloadSize:
+				// OK to send, at least 1 MSS sized segment fits
+				// in the receiver's advertised window.
+			default:
+				return false
+			}
+		}
+
+		// The segment size limit is computed as a function of sender
+		// congestion window and MSS. When sender congestion window is >
+		// 1, this limit can be larger than MSS. Ensure that the
+		// currently available send space is not greater than minimum of
+		// this limit and MSS.
 		if available > limit {
 			available = limit
 		}
 
+		// If GSO is not in use then cap available to
+		// maxPayloadSize. When GSO is in use the gVisor GSO logic or
+		// the host GSO logic will cap the segment to the correct size.
+		if s.ep.gso == nil && available > s.maxPayloadSize {
+			available = s.maxPayloadSize
+		}
+
 		if seg.data.Size() > available {
 			s.splitSeg(seg, available)
 		}
@@ -716,64 +913,47 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 // section 5, step C.
 func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) {
 	s.SetPipe()
+
+	if smss := int(s.ep.scoreboard.SMSS()); limit > smss {
+		// Cap segment size limit to SMSS as SACK recovery requires
+		// that all retransmissions or new segments sent during recovery
+		// be of <= SMSS.
+		limit = smss
+	}
+
+	nextSegHint := s.writeList.Front()
 	for s.outstanding < s.sndCwnd {
-		nextSeg, s3, s4 := s.NextSeg()
+		var nextSeg *segment
+		var rescueRtx bool
+		nextSeg, nextSegHint, rescueRtx = s.NextSeg(nextSegHint)
 		if nextSeg == nil {
-			// NextSeg():
-			//
-			// Step (2): "If no sequence number 'S2' per rule (1)
-			// exists but there exists available unsent data and the
-			// receiver's advertised window allows, the sequence
-			// range of one segment of up to SMSS octets of
-			// previously unsent data starting with sequence number
-			// HighData+1 MUST be returned."
-			for seg := s.writeNext; seg != nil; seg = seg.Next() {
-				if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) {
-					continue
-				}
-				// Step C.3 described below is handled by
-				// maybeSendSegment which increments sndNxt when
-				// a segment is transmitted.
-				//
-				// Step C.3 "If any of the data octets sent in
-				// (C.1) are above HighData, HighData must be
-				// updated to reflect the transmission of
-				// previously unsent data."
-				if sent := s.maybeSendSegment(seg, limit, end); !sent {
-					break
-				}
-				dataSent = true
-				s.outstanding++
-				s.writeNext = seg.Next()
-				nextSeg = seg
-				break
-			}
-			if nextSeg != nil {
-				continue
-			}
-		}
-		rescueRtx := false
-		if nextSeg == nil && s3 != nil {
-			nextSeg = s3
-		}
-		if nextSeg == nil && s4 != nil {
-			nextSeg = s4
-			rescueRtx = true
+			return dataSent
 		}
-		if nextSeg == nil {
-			break
-		}
-		segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen())
-		if !rescueRtx && nextSeg.sequenceNumber.LessThan(s.sndNxt) {
-			// RFC 6675, Step C.2
+		if !s.isAssignedSequenceNumber(nextSeg) || s.sndNxt.LessThanEq(nextSeg.sequenceNumber) {
+			// New data being sent.
+
+			// Step C.3 described below is handled by
+			// maybeSendSegment which increments sndNxt when
+			// a segment is transmitted.
 			//
-			// "If any of the data octets sent in (C.1) are below
-			// HighData, HighRxt MUST be set to the highest sequence
-			// number of the retransmitted segment unless NextSeg ()
-			// rule (4) was invoked for this retransmission."
-			s.fr.highRxt = segEnd - 1
+			// Step C.3 "If any of the data octets sent in
+			// (C.1) are above HighData, HighData must be
+			// updated to reflect the transmission of
+			// previously unsent data."
+			//
+			// We pass the SMSS-capped limit as Step 2) requires that
+			// new data sent should be of size SMSS or less.
+			if sent := s.maybeSendSegment(nextSeg, limit, end); !sent {
+				return dataSent
+			}
+			dataSent = true
+			s.outstanding++
+			s.writeNext = nextSeg.Next()
+			continue
 		}
 
+		// Now handle the retransmission case where we matched either step 1,3 or 4
+		// of the NextSeg algorithm.
 		// RFC 6675, Step C.4.
 		//
 		// "The estimate of the amount of data outstanding in the network
@@ -782,10 +962,54 @@ func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool)
 		s.outstanding++
 		dataSent = true
 		s.sendSegment(nextSeg)
+
+		segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen())
+		if rescueRtx {
+			// We do the last part of rule (4) of NextSeg here to update
+			// RescueRxt as until this point we don't know if we are going
+			// to use the rescue transmission.
+			s.fr.rescueRxt = s.fr.last
+		} else {
+			// RFC 6675, Step C.2
+			//
+			// "If any of the data octets sent in (C.1) are below
+			// HighData, HighRxt MUST be set to the highest sequence
+			// number of the retransmitted segment unless NextSeg ()
+			// rule (4) was invoked for this retransmission."
+			s.fr.highRxt = segEnd - 1
+		}
 	}
 	return dataSent
 }
 
+// sendZeroWindowProbe sends one zero window probe and rearms the timer.
+func (s *sender) sendZeroWindowProbe() {
+	rcvNxt, rcvWnd := s.ep.rcv.getSendParams()
+	s.unackZeroWindowProbes++
+	// The probe is an empty ACK sequenced at the last acknowledged byte.
+	probeSeq := s.sndUna - 1
+	s.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, probeSeq, rcvNxt, rcvWnd)
+	s.resendTimer.enable(s.rto) // Keep probing.
+}
+
+// enableZeroWindowProbing turns on zero window probing, piggybacking it on
+// the retransmit timer armed with the current retransmission interval.
+func (s *sender) enableZeroWindowProbing() {
+	s.zeroWindowProbing = true
+	// Probing may start mid-retransmission; keep any time already recorded.
+	if firstXmit := &s.firstRetransmittedSegXmitTime; firstXmit.IsZero() {
+		*firstXmit = time.Now()
+	}
+	s.resendTimer.enable(s.rto)
+}
+
+// disableZeroWindowProbing stops probing, resets its state and the timer.
+func (s *sender) disableZeroWindowProbing() {
+	s.zeroWindowProbing, s.unackZeroWindowProbes = false, 0
+	s.firstRetransmittedSegXmitTime = time.Time{}
+	s.resendTimer.disable()
+}
+
 // sendData sends new data segments. It is called when data becomes available or
 // when the send window opens up.
 func (s *sender) sendData() {
@@ -799,7 +1023,7 @@ func (s *sender) sendData() {
 	// "A TCP SHOULD set cwnd to no more than RW before beginning
 	// transmission if the TCP has not sent data in the interval exceeding
 	// the retrasmission timeout."
-	if !s.fr.active && time.Now().Sub(s.lastSendTime) > s.rto {
+	if !s.fr.active && s.state != RTORecovery && time.Now().Sub(s.lastSendTime) > s.rto {
 		if s.sndCwnd > InitialCwnd {
 			s.sndCwnd = InitialCwnd
 		}
@@ -817,6 +1041,9 @@ func (s *sender) sendData() {
 				limit = cwndLimit
 			}
 			if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
+				// Move writeNext along so that we don't try and scan data that
+				// has already been SACKED.
+				s.writeNext = seg.Next()
 				continue
 			}
 			if sent := s.maybeSendSegment(seg, limit, end); !sent {
@@ -834,6 +1061,13 @@ func (s *sender) sendData() {
 		s.ep.disableKeepaliveTimer()
 	}
 
+	// If the remote has advertised a zero receive window and we have
+	// data to be sent out, start zero window probing to query the
+	// remote for its receive window size.
+	if s.writeNext != nil && s.sndWnd == 0 {
+		s.enableZeroWindowProbing()
+	}
+
 	// Enable the timer if we have pending data and it's not enabled yet.
 	if !s.resendTimer.enabled() && s.sndUna != s.sndNxt {
 		s.resendTimer.enable(s.rto)
@@ -855,6 +1089,8 @@ func (s *sender) enterFastRecovery() {
 	s.fr.first = s.sndUna
 	s.fr.last = s.sndNxt - 1
 	s.fr.maxCwnd = s.sndCwnd + s.outstanding
+	s.fr.highRxt = s.sndUna
+	s.fr.rescueRxt = s.sndUna
 	if s.ep.sackPermitted {
 		s.state = SACKRecovery
 		s.ep.stack.Stats().TCP.SACKRecovery.Increment()
@@ -1040,21 +1276,21 @@ func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) {
 
 // handleRcvdSegment is called when a segment is received; it is responsible for
 // updating the send-related state.
-func (s *sender) handleRcvdSegment(seg *segment) {
+func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 	// Check if we can extract an RTT measurement from this ack.
-	if !seg.parsedOptions.TS && s.rttMeasureSeqNum.LessThan(seg.ackNumber) {
+	if !rcvdSeg.parsedOptions.TS && s.rttMeasureSeqNum.LessThan(rcvdSeg.ackNumber) {
 		s.updateRTO(time.Now().Sub(s.rttMeasureTime))
 		s.rttMeasureSeqNum = s.sndNxt
 	}
 
 	// Update Timestamp if required. See RFC7323, section-4.3.
-	if s.ep.sendTSOk && seg.parsedOptions.TS {
-		s.ep.updateRecentTimestamp(seg.parsedOptions.TSVal, s.maxSentAck, seg.sequenceNumber)
+	if s.ep.sendTSOk && rcvdSeg.parsedOptions.TS {
+		s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.maxSentAck, rcvdSeg.sequenceNumber)
 	}
 
 	// Insert SACKBlock information into our scoreboard.
 	if s.ep.sackPermitted {
-		for _, sb := range seg.parsedOptions.SACKBlocks {
+		for _, sb := range rcvdSeg.parsedOptions.SACKBlocks {
 			// Only insert the SACK block if the following holds
 			// true:
 			//  * SACK block acks data after the ack number in the
@@ -1067,22 +1303,40 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 			// NOTE: This check specifically excludes DSACK blocks
 			// which have start/end before sndUna and are used to
 			// indicate spurious retransmissions.
-			if seg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
+			if rcvdSeg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
 				s.ep.scoreboard.Insert(sb)
-				seg.hasNewSACKInfo = true
+				rcvdSeg.hasNewSACKInfo = true
 			}
 		}
 		s.SetPipe()
 	}
 
 	// Count the duplicates and do the fast retransmit if needed.
-	rtx := s.checkDuplicateAck(seg)
+	rtx := s.checkDuplicateAck(rcvdSeg)
 
 	// Stash away the current window size.
-	s.sndWnd = seg.window
+	s.sndWnd = rcvdSeg.window
+
+	ack := rcvdSeg.ackNumber
+
+	// Disable zero window probing if remote advertizes a non-zero receive
+	// window. This can be with an ACK to the zero window probe (where the
+	// acknumber refers to the already acknowledged byte) OR to any previously
+	// unacknowledged segment.
+	if s.zeroWindowProbing && rcvdSeg.window > 0 &&
+		(ack == s.sndUna || (ack-1).InRange(s.sndUna, s.sndNxt)) {
+		s.disableZeroWindowProbing()
+	}
+
+	// On receiving the ACK for the zero window probe, account for it and
+	// skip trying to send any segment as we are still probing for
+	// receive window to become non-zero.
+	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.sndUna {
+		s.unackZeroWindowProbes--
+		return
+	}
 
 	// Ignore ack if it doesn't acknowledge any new data.
-	ack := seg.ackNumber
 	if (ack - 1).InRange(s.sndUna, s.sndNxt) {
 		s.dupAckCount = 0
 
@@ -1094,15 +1348,15 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 		//    averaged RTT measurement only if the segment acknowledges
 		//    some new data, i.e., only if it advances the left edge of
 		//    the send window.
-		if s.ep.sendTSOk && seg.parsedOptions.TSEcr != 0 {
+		if s.ep.sendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 {
 			// TSVal/Ecr values sent by Netstack are at a millisecond
 			// granularity.
-			elapsed := time.Duration(s.ep.timestamp()-seg.parsedOptions.TSEcr) * time.Millisecond
+			elapsed := time.Duration(s.ep.timestamp()-rcvdSeg.parsedOptions.TSEcr) * time.Millisecond
 			s.updateRTO(elapsed)
 		}
 
 		// When an ack is received we must rearm the timer.
-		// RFC 6298 5.2
+		// RFC 6298 5.3
 		s.resendTimer.enable(s.rto)
 
 		// Remove all acknowledged data from the write list.
@@ -1111,6 +1365,9 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 
 		ackLeft := acked
 		originalOutstanding := s.outstanding
+		s.rtt.Lock()
+		srtt := s.rtt.srtt
+		s.rtt.Unlock()
 		for ackLeft > 0 {
 			// We use logicalLen here because we can have FIN
 			// segments (which are always at the end of list) that
@@ -1129,6 +1386,12 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 			if s.writeNext == seg {
 				s.writeNext = seg.Next()
 			}
+
+			// Update the RACK fields if SACK is enabled.
+			if s.ep.sackPermitted {
+				s.rc.Update(seg, rcvdSeg, srtt, s.ep.tsOffset)
+			}
+
 			s.writeList.Remove(seg)
 
 			// if SACK is enabled then Only reduce outstanding if
@@ -1169,6 +1432,8 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 		// RFC 6298 Rule 5.3
 		if s.sndUna == s.sndNxt {
 			s.outstanding = 0
+			// Reset firstRetransmittedSegXmitTime to the zero value.
+			s.firstRetransmittedSegXmitTime = time.Time{}
 			s.resendTimer.disable()
 		}
 	}
@@ -1182,14 +1447,14 @@ func (s *sender) handleRcvdSegment(seg *segment) {
 	// that the window opened up, or the congestion window was inflated due
 	// to a duplicate ack during fast recovery. This will also re-enable
 	// the retransmit timer if needed.
-	if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || seg.hasNewSACKInfo {
+	if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || rcvdSeg.hasNewSACKInfo {
 		s.sendData()
 	}
 }
 
 // sendSegment sends the specified segment.
 func (s *sender) sendSegment(seg *segment) *tcpip.Error {
-	if !seg.xmitTime.IsZero() {
+	if seg.xmitCount > 0 {
 		s.ep.stack.Stats().TCP.Retransmits.Increment()
 		s.ep.stats.SendErrors.Retransmits.Increment()
 		if s.sndCwnd < s.sndSsthresh {
@@ -1197,7 +1462,24 @@ func (s *sender) sendSegment(seg *segment) *tcpip.Error {
 		}
 	}
 	seg.xmitTime = time.Now()
-	return s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber)
+	seg.xmitCount++
+	err := s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber)
+
+	// Every time a packet containing data is sent (including a
+	// retransmission), if SACK is enabled and we are retransmitting data
+	// then use the conservative timer described in RFC6675 Section 6.0,
+	// otherwise follow the standard time described in RFC6298 Section 5.1.
+	if err != nil && seg.data.Size() != 0 {
+		if s.fr.active && seg.xmitCount > 1 && s.ep.sackPermitted {
+			s.resendTimer.enable(s.rto)
+		} else {
+			if !s.resendTimer.enabled() {
+				s.resendTimer.enable(s.rto)
+			}
+		}
+	}
+
+	return err
 }
 
 // sendSegmentFromView sends a new segment containing the given payload, flags
@@ -1213,19 +1495,5 @@ func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags byte, seq
 	// Remember the max sent ack.
 	s.maxSentAck = rcvNxt
 
-	// Every time a packet containing data is sent (including a
-	// retransmission), if SACK is enabled then use the conservative timer
-	// described in RFC6675 Section 4.0, otherwise follow the standard time
-	// described in RFC6298 Section 5.2.
-	if data.Size() != 0 {
-		if s.ep.sackPermitted {
-			s.resendTimer.enable(s.rto)
-		} else {
-			if !s.resendTimer.enabled() {
-				s.resendTimer.enable(s.rto)
-			}
-		}
-	}
-
 	return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd)
 }