summaryrefslogtreecommitdiffhomepage
path: root/pkg/tcpip/transport/tcp
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/tcpip/transport/tcp')
-rw-r--r--pkg/tcpip/transport/tcp/endpoint.go4
-rw-r--r--pkg/tcpip/transport/tcp/endpoint_state.go3
-rw-r--r--pkg/tcpip/transport/tcp/sack_scoreboard.go27
-rw-r--r--pkg/tcpip/transport/tcp/segment.go10
-rw-r--r--pkg/tcpip/transport/tcp/segment_state.go12
-rw-r--r--pkg/tcpip/transport/tcp/snd.go51
6 files changed, 97 insertions, 10 deletions
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index aa31a78af..a8618bb4a 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -178,6 +178,9 @@ type endpoint struct {
// cork is a boolean (0 is false) and must be accessed atomically.
cork uint32
+ // scoreboard holds TCP SACK Scoreboard information for this endpoint.
+ scoreboard *SACKScoreboard
+
// The options below aren't implemented, but we remember the user
// settings because applications expect to be able to set/query these
// options.
@@ -1627,6 +1630,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
s.SACKPermitted = e.sackPermitted
s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
+ s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()
// Copy endpoint send state.
e.sndBufMu.Lock()
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index a07cd9011..ca7852d04 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -204,6 +204,9 @@ func (e *endpoint) afterLoad() {
e.connectingAddress = e.id.RemoteAddress
}
}
+ // Reset the scoreboard to reinitialize the sack information as
+ // we do not restore SACK information.
+ e.scoreboard.Reset()
if err := e.connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.id.RemotePort}, false, e.workerRunning); err != tcpip.ErrConnectStarted {
panic("endpoint connecting failed: " + err.String())
}
diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard.go b/pkg/tcpip/transport/tcp/sack_scoreboard.go
index 5dba8a165..67f25f050 100644
--- a/pkg/tcpip/transport/tcp/sack_scoreboard.go
+++ b/pkg/tcpip/transport/tcp/sack_scoreboard.go
@@ -23,27 +23,42 @@ import (
"gvisor.googlesource.com/gvisor/pkg/tcpip/seqnum"
)
-// maxSACKBlocks is the maximum number of distinct SACKBlocks the scoreboard
-// will track. Once there are 100 distinct blocks, new insertions will fail.
-const maxSACKBlocks = 100
+const (
+ // maxSACKBlocks is the maximum number of distinct SACKBlocks the
+ // scoreboard will track. Once there are 100 distinct blocks, new
+ // insertions will fail.
+ maxSACKBlocks = 100
+
+ // defaultBtreeDegree is set to 2 as btree.New(2) results in a 2-3-4
+ // tree.
+ defaultBtreeDegree = 2
+)
// SACKScoreboard stores a set of disjoint SACK ranges.
+//
+// +stateify savable
type SACKScoreboard struct {
smss uint16
maxSACKED seqnum.Value
- sacked seqnum.Size
- ranges *btree.BTree
+ sacked seqnum.Size `state:"nosave"`
+ ranges *btree.BTree `state:"nosave"`
}
// NewSACKScoreboard returns a new SACK Scoreboard.
func NewSACKScoreboard(smss uint16, iss seqnum.Value) *SACKScoreboard {
return &SACKScoreboard{
smss: smss,
- ranges: btree.New(2),
+ ranges: btree.New(defaultBtreeDegree),
maxSACKED: iss,
}
}
+// Reset erases all known range information from the SACK scoreboard.
+func (s *SACKScoreboard) Reset() {
+ s.ranges = btree.New(defaultBtreeDegree)
+ s.sacked = 0
+}
+
// Insert inserts/merges the provided SACKBlock into the scoreboard.
func (s *SACKScoreboard) Insert(r header.SACKBlock) {
if s.ranges.Len() >= maxSACKBlocks {
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 87c6d7d20..bd8017f64 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -16,6 +16,7 @@ package tcp
import (
"sync/atomic"
+ "time"
"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
@@ -56,8 +57,10 @@ type segment struct {
window seqnum.Size
// parsedOptions stores the parsed values from the options in the segment.
- parsedOptions header.TCPOptions
- options []byte `state:".([]byte)"`
+ parsedOptions header.TCPOptions
+ options []byte `state:".([]byte)"`
+ hasNewSACKInfo bool
+ rcvdTime time.Time `state:".(unixTime)"`
}
func newSegment(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) *segment {
@@ -67,6 +70,7 @@ func newSegment(r *stack.Route, id stack.TransportEndpointID, vv buffer.Vectoris
route: r.Clone(),
}
s.data = vv.Clone(s.views[:])
+ s.rcvdTime = time.Now()
return s
}
@@ -78,6 +82,7 @@ func newSegmentFromView(r *stack.Route, id stack.TransportEndpointID, v buffer.V
}
s.views[0] = v
s.data = buffer.NewVectorisedView(len(v), s.views[:1])
+ s.rcvdTime = time.Now()
return s
}
@@ -91,6 +96,7 @@ func (s *segment) clone() *segment {
window: s.window,
route: s.route.Clone(),
viewToDeliver: s.viewToDeliver,
+ rcvdTime: s.rcvdTime,
}
t.data = s.data.Clone(t.views[:])
return t
diff --git a/pkg/tcpip/transport/tcp/segment_state.go b/pkg/tcpip/transport/tcp/segment_state.go
index d4bd6cf95..7b98a3ec8 100644
--- a/pkg/tcpip/transport/tcp/segment_state.go
+++ b/pkg/tcpip/transport/tcp/segment_state.go
@@ -15,6 +15,8 @@
package tcp
import (
+ "time"
+
"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
)
@@ -58,3 +60,13 @@ func (s *segment) loadOptions(options []byte) {
// allocated so there is no cost here.
s.options = options
}
+
+// saveRcvdTime is invoked by stateify.
+func (s *segment) saveRcvdTime() unixTime {
+ return unixTime{s.rcvdTime.Unix(), s.rcvdTime.UnixNano()}
+}
+
+// loadRcvdTime is invoked by stateify.
+func (s *segment) loadRcvdTime(unix unixTime) {
+ s.rcvdTime = time.Unix(unix.second, unix.nano)
+}
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 5c9e76fa1..8312ae077 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -199,10 +199,12 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
s.sndWndScale = uint8(sndWndScale)
}
- s.updateMaxPayloadSize(int(ep.route.MTU()), 0)
-
+ // Initialize SACK Scoreboard.
+ s.ep.scoreboard = NewSACKScoreboard(mss, iss)
s.resendTimer.init(&s.resendWaker)
+ s.updateMaxPayloadSize(int(ep.route.MTU()), 0)
+
return s
}
@@ -380,6 +382,21 @@ func (s *sender) retransmitTimerExpired() bool {
// We'll keep on transmitting (or retransmitting) as we get acks for
// the data we transmit.
s.outstanding = 0
+
+ // Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1
+ //
+ // In order to avoid memory deadlocks, the TCP receiver is allowed to
+ // discard data that has already been selectively acknowledged. As a
+ // result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK
+ // information gathered from a receiver upon a retransmission timeout
+ // (RTO) "since the timeout might indicate that the data receiver has
+ // reneged." Additionally, a TCP sender MUST "ignore prior SACK
+ // information in determining which data to retransmit."
+ //
+ // NOTE: We take the stricter interpretation and just expunge all
+ // information as we lack more rigorous checks to validate if the SACK
+ // information is usable after an RTO.
+ s.ep.scoreboard.Reset()
s.writeNext = s.writeList.Front()
s.sendData()
@@ -550,6 +567,10 @@ func (s *sender) leaveFastRecovery() {
// Deflate cwnd. It had been artificially inflated when new dups arrived.
s.sndCwnd = s.sndSsthresh
+
+ // As recovery is now complete, delete all SACK information for acked
+ // data.
+ s.ep.scoreboard.Delete(s.sndUna)
s.cc.PostRecovery()
}
@@ -644,6 +665,29 @@ func (s *sender) handleRcvdSegment(seg *segment) {
if s.ep.sendTSOk && seg.parsedOptions.TS {
s.ep.updateRecentTimestamp(seg.parsedOptions.TSVal, s.maxSentAck, seg.sequenceNumber)
}
+
+ // Insert SACKBlock information into our scoreboard.
+ if s.ep.sackPermitted {
+ for _, sb := range seg.parsedOptions.SACKBlocks {
+ // Only insert the SACK block if the following holds
+ // true:
+ // * SACK block acks data after the ack number in the
+ // current segment.
+ // * SACK block represents a sequence
+ // between sndUna and sndNxt (i.e. data that is
+ // currently unacked and in-flight).
+ // * SACK block that has not been SACKed already.
+ //
+ // NOTE: This check specifically excludes DSACK blocks
+ // which have start/end before sndUna and are used to
+ // indicate spurious retransmissions.
+ if seg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
+ s.ep.scoreboard.Insert(sb)
+ seg.hasNewSACKInfo = true
+ }
+ }
+ }
+
// Count the duplicates and do the fast retransmit if needed.
rtx := s.checkDuplicateAck(seg)
@@ -702,6 +746,9 @@ func (s *sender) handleRcvdSegment(seg *segment) {
// Update the send buffer usage and notify potential waiters.
s.ep.updateSndBufferUsage(int(acked))
+ // Clear SACK information for all acked data.
+ s.ep.scoreboard.Delete(s.sndUna)
+
// If we are not in fast recovery then update the congestion
// window based on the number of acknowledged packets.
if !s.fr.active {