From 26be25e4ecec2fa66ee819e980de353d657aa1f6 Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Mon, 25 Feb 2019 15:18:55 -0800 Subject: Add a SACK scoreboard to TCP endpoints. This change does not make use of SACK information but adds support to track SACK information and store it in the endpoint. The actual SACK based recovery will be in a separate CL. Part of commits to add RFC 6675 support to Netstack. PiperOrigin-RevId: 235612264 Change-Id: I261f94844d7bad5abda803152ce6cc6125a467ff --- pkg/tcpip/transport/tcp/endpoint.go | 4 +++ pkg/tcpip/transport/tcp/endpoint_state.go | 3 ++ pkg/tcpip/transport/tcp/sack_scoreboard.go | 27 ++++++++++++---- pkg/tcpip/transport/tcp/segment.go | 10 ++++-- pkg/tcpip/transport/tcp/segment_state.go | 12 +++++++ pkg/tcpip/transport/tcp/snd.go | 51 ++++++++++++++++++++++++++++-- 6 files changed, 97 insertions(+), 10 deletions(-) (limited to 'pkg/tcpip') diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index aa31a78af..a8618bb4a 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -178,6 +178,9 @@ type endpoint struct { // cork is a boolean (0 is false) and must be accessed atomically. cork uint32 + // scoreboard holds TCP SACK Scoreboard information for this endpoint. + scoreboard *SACKScoreboard + // The options below aren't implemented, but we remember the user // settings because applications expect to be able to set/query these // options. @@ -1627,6 +1630,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState { s.SACKPermitted = e.sackPermitted s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks) copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks]) + s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy() // Copy endpoint send state. e.sndBufMu.Lock() diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index a07cd9011..ca7852d04 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -204,6 +204,9 @@ func (e *endpoint) afterLoad() { e.connectingAddress = e.id.RemoteAddress } } + // Reset the scoreboard to reinitialize the sack information as + // we do not restore SACK information. + e.scoreboard.Reset() if err := e.connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.id.RemotePort}, false, e.workerRunning); err != tcpip.ErrConnectStarted { panic("endpoint connecting failed: " + err.String()) } diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard.go b/pkg/tcpip/transport/tcp/sack_scoreboard.go index 5dba8a165..67f25f050 100644 --- a/pkg/tcpip/transport/tcp/sack_scoreboard.go +++ b/pkg/tcpip/transport/tcp/sack_scoreboard.go @@ -23,27 +23,42 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/seqnum" ) -// maxSACKBlocks is the maximum number of distinct SACKBlocks the scoreboard -// will track. Once there are 100 distinct blocks, new insertions will fail. -const maxSACKBlocks = 100 +const ( + // maxSACKBlocks is the maximum number of distinct SACKBlocks the + // scoreboard will track. Once there are 100 distinct blocks, new + // insertions will fail. + maxSACKBlocks = 100 + + // defaultBtreeDegree is set to 2 as btree.New(2) results in a 2-3-4 + // tree. + defaultBtreeDegree = 2 +) // SACKScoreboard stores a set of disjoint SACK ranges. +// +// +stateify savable type SACKScoreboard struct { smss uint16 maxSACKED seqnum.Value - sacked seqnum.Size - ranges *btree.BTree + sacked seqnum.Size `state:"nosave"` + ranges *btree.BTree `state:"nosave"` } // NewSACKScoreboard returns a new SACK Scoreboard. func NewSACKScoreboard(smss uint16, iss seqnum.Value) *SACKScoreboard { return &SACKScoreboard{ smss: smss, - ranges: btree.New(2), + ranges: btree.New(defaultBtreeDegree), maxSACKED: iss, } } +// Reset erases all known range information from the SACK scoreboard. +func (s *SACKScoreboard) Reset() { + s.ranges = btree.New(defaultBtreeDegree) + s.sacked = 0 +} + // Insert inserts/merges the provided SACKBlock into the scoreboard. func (s *SACKScoreboard) Insert(r header.SACKBlock) { if s.ranges.Len() >= maxSACKBlocks { diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 87c6d7d20..bd8017f64 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -16,6 +16,7 @@ package tcp import ( "sync/atomic" + "time" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" @@ -56,8 +57,10 @@ type segment struct { window seqnum.Size // parsedOptions stores the parsed values from the options in the segment. - parsedOptions header.TCPOptions - options []byte `state:".([]byte)"` + parsedOptions header.TCPOptions + options []byte `state:".([]byte)"` + hasNewSACKInfo bool + rcvdTime time.Time `state:".(unixTime)"` } func newSegment(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) *segment { @@ -67,6 +70,7 @@ func newSegment(r *stack.Route, id stack.TransportEndpointID, vv buffer.Vectoris route: r.Clone(), } s.data = vv.Clone(s.views[:]) + s.rcvdTime = time.Now() return s } @@ -78,6 +82,7 @@ func newSegmentFromView(r *stack.Route, id stack.TransportEndpointID, v buffer.V } s.views[0] = v s.data = buffer.NewVectorisedView(len(v), s.views[:1]) + s.rcvdTime = time.Now() return s } @@ -91,6 +96,7 @@ func (s *segment) clone() *segment { window: s.window, route: s.route.Clone(), viewToDeliver: s.viewToDeliver, + rcvdTime: s.rcvdTime, } t.data = s.data.Clone(t.views[:]) return t diff --git a/pkg/tcpip/transport/tcp/segment_state.go b/pkg/tcpip/transport/tcp/segment_state.go index d4bd6cf95..7b98a3ec8 100644 --- a/pkg/tcpip/transport/tcp/segment_state.go +++ b/pkg/tcpip/transport/tcp/segment_state.go @@ -15,6 +15,8 @@ package tcp import ( + "time" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" ) @@ -58,3 +60,13 @@ func (s *segment) loadOptions(options []byte) { // allocated so there is no cost here. s.options = options } + +// saveRcvdTime is invoked by stateify. +func (s *segment) saveRcvdTime() unixTime { + return unixTime{s.rcvdTime.Unix(), s.rcvdTime.UnixNano()} +} + +// loadRcvdTime is invoked by stateify. +func (s *segment) loadRcvdTime(unix unixTime) { + s.rcvdTime = time.Unix(unix.second, unix.nano) +} diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 5c9e76fa1..8312ae077 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -199,10 +199,12 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint s.sndWndScale = uint8(sndWndScale) } - s.updateMaxPayloadSize(int(ep.route.MTU()), 0) - + // Initialize SACK Scoreboard. + s.ep.scoreboard = NewSACKScoreboard(mss, iss) s.resendTimer.init(&s.resendWaker) + s.updateMaxPayloadSize(int(ep.route.MTU()), 0) + return s } @@ -380,6 +382,21 @@ func (s *sender) retransmitTimerExpired() bool { // We'll keep on transmitting (or retransmitting) as we get acks for // the data we transmit. s.outstanding = 0 + + // Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1 + // + // In order to avoid memory deadlocks, the TCP receiver is allowed to + // discard data that has already been selectively acknowledged. As a + // result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK + // information gathered from a receiver upon a retransmission timeout + // (RTO) "since the timeout might indicate that the data receiver has + // reneged." Additionally, a TCP sender MUST "ignore prior SACK + // information in determining which data to retransmit." + // + // NOTE: We take the stricter interpretation and just expunge all + // information as we lack more rigorous checks to validate if the SACK + // information is usable after an RTO. + s.ep.scoreboard.Reset() s.writeNext = s.writeList.Front() s.sendData() @@ -550,6 +567,10 @@ func (s *sender) leaveFastRecovery() { // Deflate cwnd. It had been artificially inflated when new dups arrived. s.sndCwnd = s.sndSsthresh + + // As recovery is now complete, delete all SACK information for acked + // data. + s.ep.scoreboard.Delete(s.sndUna) s.cc.PostRecovery() } @@ -644,6 +665,29 @@ func (s *sender) handleRcvdSegment(seg *segment) { if s.ep.sendTSOk && seg.parsedOptions.TS { s.ep.updateRecentTimestamp(seg.parsedOptions.TSVal, s.maxSentAck, seg.sequenceNumber) } + + // Insert SACKBlock information into our scoreboard. + if s.ep.sackPermitted { + for _, sb := range seg.parsedOptions.SACKBlocks { + // Only insert the SACK block if the following holds + // true: + // * SACK block acks data after the ack number in the + // current segment. + // * SACK block represents a sequence + // between sndUna and sndNxt (i.e. data that is + // currently unacked and in-flight). + // * SACK block that has not been SACKed already. + // + // NOTE: This check specifically excludes DSACK blocks + // which have start/end before sndUna and are used to + // indicate spurious retransmissions. + if seg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) { + s.ep.scoreboard.Insert(sb) + seg.hasNewSACKInfo = true + } + } + } + // Count the duplicates and do the fast retransmit if needed. rtx := s.checkDuplicateAck(seg) @@ -702,6 +746,9 @@ func (s *sender) handleRcvdSegment(seg *segment) { // Update the send buffer usage and notify potential waiters. s.ep.updateSndBufferUsage(int(acked)) + // Clear SACK information for all acked data. + s.ep.scoreboard.Delete(s.sndUna) + // If we are not in fast recovery then update the congestion // window based on the number of acknowledged packets. if !s.fr.active { -- cgit v1.2.3