diff options
Diffstat (limited to 'pkg')
-rw-r--r-- | pkg/tcpip/transport/tcp/endpoint.go | 4 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/endpoint_state.go | 3 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/sack_scoreboard.go | 27 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/segment.go | 10 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/segment_state.go | 12 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/snd.go | 51 |
6 files changed, 97 insertions, 10 deletions
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index aa31a78af..a8618bb4a 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -178,6 +178,9 @@ type endpoint struct { // cork is a boolean (0 is false) and must be accessed atomically. cork uint32 + // scoreboard holds TCP SACK Scoreboard information for this endpoint. + scoreboard *SACKScoreboard + // The options below aren't implemented, but we remember the user // settings because applications expect to be able to set/query these // options. @@ -1627,6 +1630,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState { s.SACKPermitted = e.sackPermitted s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks) copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks]) + s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy() // Copy endpoint send state. e.sndBufMu.Lock() diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index a07cd9011..ca7852d04 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -204,6 +204,9 @@ func (e *endpoint) afterLoad() { e.connectingAddress = e.id.RemoteAddress } } + // Reset the scoreboard to reinitialize the sack information as + // we do not restore SACK information. + e.scoreboard.Reset() if err := e.connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.id.RemotePort}, false, e.workerRunning); err != tcpip.ErrConnectStarted { panic("endpoint connecting failed: " + err.String()) } diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard.go b/pkg/tcpip/transport/tcp/sack_scoreboard.go index 5dba8a165..67f25f050 100644 --- a/pkg/tcpip/transport/tcp/sack_scoreboard.go +++ b/pkg/tcpip/transport/tcp/sack_scoreboard.go @@ -23,27 +23,42 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/seqnum" ) -// maxSACKBlocks is the maximum number of distinct SACKBlocks the scoreboard -// will track. Once there are 100 distinct blocks, new insertions will fail. -const maxSACKBlocks = 100 +const ( + // maxSACKBlocks is the maximum number of distinct SACKBlocks the + // scoreboard will track. Once there are 100 distinct blocks, new + // insertions will fail. + maxSACKBlocks = 100 + + // defaultBtreeDegree is set to 2 as btree.New(2) results in a 2-3-4 + // tree. + defaultBtreeDegree = 2 +) // SACKScoreboard stores a set of disjoint SACK ranges. +// +// +stateify savable type SACKScoreboard struct { smss uint16 maxSACKED seqnum.Value - sacked seqnum.Size - ranges *btree.BTree + sacked seqnum.Size `state:"nosave"` + ranges *btree.BTree `state:"nosave"` } // NewSACKScoreboard returns a new SACK Scoreboard. func NewSACKScoreboard(smss uint16, iss seqnum.Value) *SACKScoreboard { return &SACKScoreboard{ smss: smss, - ranges: btree.New(2), + ranges: btree.New(defaultBtreeDegree), maxSACKED: iss, } } +// Reset erases all known range information from the SACK scoreboard. +func (s *SACKScoreboard) Reset() { + s.ranges = btree.New(defaultBtreeDegree) + s.sacked = 0 +} + // Insert inserts/merges the provided SACKBlock into the scoreboard. func (s *SACKScoreboard) Insert(r header.SACKBlock) { if s.ranges.Len() >= maxSACKBlocks { diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 87c6d7d20..bd8017f64 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -16,6 +16,7 @@ package tcp import ( "sync/atomic" + "time" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" @@ -56,8 +57,10 @@ type segment struct { window seqnum.Size // parsedOptions stores the parsed values from the options in the segment. - parsedOptions header.TCPOptions - options []byte `state:".([]byte)"` + parsedOptions header.TCPOptions + options []byte `state:".([]byte)"` + hasNewSACKInfo bool + rcvdTime time.Time `state:".(unixTime)"` } func newSegment(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) *segment { @@ -67,6 +70,7 @@ func newSegment(r *stack.Route, id stack.TransportEndpointID, vv buffer.Vectoris route: r.Clone(), } s.data = vv.Clone(s.views[:]) + s.rcvdTime = time.Now() return s } @@ -78,6 +82,7 @@ func newSegmentFromView(r *stack.Route, id stack.TransportEndpointID, v buffer.V } s.views[0] = v s.data = buffer.NewVectorisedView(len(v), s.views[:1]) + s.rcvdTime = time.Now() return s } @@ -91,6 +96,7 @@ func (s *segment) clone() *segment { window: s.window, route: s.route.Clone(), viewToDeliver: s.viewToDeliver, + rcvdTime: s.rcvdTime, } t.data = s.data.Clone(t.views[:]) return t diff --git a/pkg/tcpip/transport/tcp/segment_state.go b/pkg/tcpip/transport/tcp/segment_state.go index d4bd6cf95..7b98a3ec8 100644 --- a/pkg/tcpip/transport/tcp/segment_state.go +++ b/pkg/tcpip/transport/tcp/segment_state.go @@ -15,6 +15,8 @@ package tcp import ( + "time" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" ) @@ -58,3 +60,13 @@ func (s *segment) loadOptions(options []byte) { // allocated so there is no cost here. s.options = options } + +// saveRcvdTime is invoked by stateify. +func (s *segment) saveRcvdTime() unixTime { + return unixTime{s.rcvdTime.Unix(), s.rcvdTime.UnixNano()} +} + +// loadRcvdTime is invoked by stateify. +func (s *segment) loadRcvdTime(unix unixTime) { + s.rcvdTime = time.Unix(unix.second, unix.nano) +} diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 5c9e76fa1..8312ae077 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -199,10 +199,12 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint s.sndWndScale = uint8(sndWndScale) } - s.updateMaxPayloadSize(int(ep.route.MTU()), 0) - + // Initialize SACK Scoreboard. + s.ep.scoreboard = NewSACKScoreboard(mss, iss) s.resendTimer.init(&s.resendWaker) + s.updateMaxPayloadSize(int(ep.route.MTU()), 0) + return s } @@ -380,6 +382,21 @@ func (s *sender) retransmitTimerExpired() bool { // We'll keep on transmitting (or retransmitting) as we get acks for // the data we transmit. s.outstanding = 0 + + // Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1 + // + // In order to avoid memory deadlocks, the TCP receiver is allowed to + // discard data that has already been selectively acknowledged. As a + // result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK + // information gathered from a receiver upon a retransmission timeout + // (RTO) "since the timeout might indicate that the data receiver has + // reneged." Additionally, a TCP sender MUST "ignore prior SACK + // information in determining which data to retransmit." + // + // NOTE: We take the stricter interpretation and just expunge all + // information as we lack more rigorous checks to validate if the SACK + // information is usable after an RTO. + s.ep.scoreboard.Reset() s.writeNext = s.writeList.Front() s.sendData() @@ -550,6 +567,10 @@ func (s *sender) leaveFastRecovery() { // Deflate cwnd. It had been artificially inflated when new dups arrived. s.sndCwnd = s.sndSsthresh + + // As recovery is now complete, delete all SACK information for acked + // data. + s.ep.scoreboard.Delete(s.sndUna) s.cc.PostRecovery() } @@ -644,6 +665,29 @@ func (s *sender) handleRcvdSegment(seg *segment) { if s.ep.sendTSOk && seg.parsedOptions.TS { s.ep.updateRecentTimestamp(seg.parsedOptions.TSVal, s.maxSentAck, seg.sequenceNumber) } + + // Insert SACKBlock information into our scoreboard. + if s.ep.sackPermitted { + for _, sb := range seg.parsedOptions.SACKBlocks { + // Only insert the SACK block if the following holds + // true: + // * SACK block acks data after the ack number in the + // current segment. + // * SACK block represents a sequence + // between sndUna and sndNxt (i.e. data that is + // currently unacked and in-flight). + // * SACK block that has not been SACKed already. + // + // NOTE: This check specifically excludes DSACK blocks + // which have start/end before sndUna and are used to + // indicate spurious retransmissions. + if seg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) { + s.ep.scoreboard.Insert(sb) + seg.hasNewSACKInfo = true + } + } + } + // Count the duplicates and do the fast retransmit if needed. rtx := s.checkDuplicateAck(seg) @@ -702,6 +746,9 @@ func (s *sender) handleRcvdSegment(seg *segment) { // Update the send buffer usage and notify potential waiters. s.ep.updateSndBufferUsage(int(acked)) + // Clear SACK information for all acked data. + s.ep.scoreboard.Delete(s.sndUna) + // If we are not in fast recovery then update the congestion // window based on the number of acknowledged packets. if !s.fr.active { |