summaryrefslogtreecommitdiffhomepage
path: root/pkg/tcpip
diff options
context:
space:
mode:
authorRahat Mahmood <rahat@google.com>2019-08-01 13:57:41 -0700
committergVisor bot <gvisor-bot@google.com>2019-08-01 13:58:48 -0700
commit79511e8a50facd509b8180d0160762b510dd6196 (patch)
treeef7dbd6a36361a9e84a7287d6e54fcbae7a4edd6 /pkg/tcpip
parent0a246fab80581351309cdfe39ffeeffa00f811b1 (diff)
Implement getsockopt(TCP_INFO).
Export some readily-available fields for TCP_INFO and stub out the rest. PiperOrigin-RevId: 261191548
Diffstat (limited to 'pkg/tcpip')
-rw-r--r--pkg/tcpip/stack/stack.go30
-rw-r--r--pkg/tcpip/tcpip.go8
-rw-r--r--pkg/tcpip/transport/tcp/endpoint.go56
-rw-r--r--pkg/tcpip/transport/tcp/rcv.go27
-rw-r--r--pkg/tcpip/transport/tcp/sack_scoreboard.go4
-rw-r--r--pkg/tcpip/transport/tcp/snd.go5
6 files changed, 85 insertions, 45 deletions
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 6156c3f46..7c31cf493 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -208,6 +208,10 @@ type TCPSenderState struct {
// Cubic holds the state related to CUBIC congestion control.
Cubic TCPCubicState
+
+ // MSS is the size of the largest segment that can be sent without
+ // fragmentation.
+ MSS int
}
// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
@@ -220,6 +224,9 @@ type TCPSACKInfo struct {
// from the peer endpoint.
ReceivedBlocks []header.SACKBlock
+ // Sacked is the current number of bytes held in the SACK scoreboard.
+ Sacked seqnum.Size
+
// MaxSACKED is the highest sequence number that has been SACKED
// by the peer.
MaxSACKED seqnum.Value
@@ -269,6 +276,14 @@ type TCPEndpointState struct {
// ID is a copy of the TransportEndpointID for the endpoint.
ID TCPEndpointID
+ // ProtocolState denotes the TCP state the endpoint is currently
+ // in, encoded in a netstack-specific manner. Should be translated
+ // to the Linux ABI before exposing to userspace.
+ ProtocolState uint32
+
+ // AMSS is the MSS advertised to the peer by this endpoint.
+ AMSS uint16
+
// SegTime denotes the absolute time when this segment was received.
SegTime time.Time
@@ -286,6 +301,18 @@ type TCPEndpointState struct {
// RcvClosed if true, indicates the endpoint has been closed for reading.
RcvClosed bool
+ // RcvLastAckNanos is the time of receipt (Unix nanoseconds) of the last
+ // packet with the ACK flag set.
+ RcvLastAckNanos int64
+
+ // RcvLastDataNanos is the time of receipt (Unix nanoseconds) of the last
+ // packet containing data.
+ RcvLastDataNanos int64
+
+ // RcvMSS is the size of the largest segment the receiver is willing to
+ // accept, not including TCP headers and options.
+ RcvMSS int
+
// SendTSOk is used to indicate when the TS Option has been negotiated.
// When sendTSOk is true every non-RST segment should carry a TS as per
// RFC7323#section-1.1.
@@ -326,6 +353,9 @@ type TCPEndpointState struct {
// SndMTU is the smallest MTU seen in the control packets received.
SndMTU int
+ // MaxOptionSize is the maximum size of TCP options.
+ MaxOptionSize int
+
// Receiver holds variables related to the TCP receiver for the endpoint.
Receiver TCPReceiverState
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 4208c0303..690c00edb 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -476,14 +476,6 @@ type QuickAckOption int
// Only supported on Unix sockets.
type PasscredOption int
-// TCPInfoOption is used by GetSockOpt to expose TCP statistics.
-//
-// TODO(b/64800844): Add and populate stat fields.
-type TCPInfoOption struct {
- RTT time.Duration
- RTTVar time.Duration
-}
-
// KeepaliveEnabledOption is used by SetSockOpt/GetSockOpt to specify whether
// TCP keepalive is enabled for this socket.
type KeepaliveEnabledOption int
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index cc49c8272..e94307bd5 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -108,6 +108,9 @@ func (s EndpointState) String() string {
}
}
+// InfoOption is used by GetSockOpt to expose TCP endpoint state.
+type InfoOption stack.TCPEndpointState
+
// Reasons for notifying the protocol goroutine.
const (
notifyNonZeroReceiveWindow = 1 << iota
@@ -202,12 +205,14 @@ type endpoint struct {
// to indicate to users that no more data is coming.
//
// rcvListMu can be taken after the endpoint mu below.
- rcvListMu sync.Mutex `state:"nosave"`
- rcvList segmentList `state:"wait"`
- rcvClosed bool
- rcvBufSize int
- rcvBufUsed int
- rcvAutoParams rcvBufAutoTuneParams
+ rcvListMu sync.Mutex `state:"nosave"`
+ rcvList segmentList `state:"wait"`
+ rcvClosed bool
+ rcvBufSize int
+ rcvBufUsed int
+ rcvAutoParams rcvBufAutoTuneParams
+ rcvLastAckNanos int64 // timestamp
+ rcvLastDataNanos int64 // timestamp
// zeroWindow indicates that the window was closed due to receive buffer
// space being filled up. This is set by the worker goroutine before
// moving a segment to the rcvList. This setting is cleared by the
@@ -1198,17 +1203,10 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
}
return nil
- case *tcpip.TCPInfoOption:
- *o = tcpip.TCPInfoOption{}
- e.mu.RLock()
- snd := e.snd
- e.mu.RUnlock()
- if snd != nil {
- snd.rtt.Lock()
- o.RTT = snd.rtt.srtt
- o.RTTVar = snd.rtt.rttvar
- snd.rtt.Unlock()
- }
+ case *InfoOption:
+ e.workMu.Lock()
+ *o = InfoOption(e.completeState())
+ e.workMu.Unlock()
return nil
case *tcpip.KeepaliveEnabledOption:
@@ -1933,22 +1931,27 @@ func (e *endpoint) maxOptionSize() (size int) {
}
// completeState makes a full copy of the endpoint and returns it. This is used
-// before invoking the probe. The state returned may not be fully consistent if
-// there are intervening syscalls when the state is being copied.
+// before invoking the probe and for getsockopt(TCP_INFO). The state returned
+// may not be fully consistent if there are intervening syscalls when the state
+// is being copied.
func (e *endpoint) completeState() stack.TCPEndpointState {
var s stack.TCPEndpointState
s.SegTime = time.Now()
- // Copy EndpointID.
- e.mu.Lock()
+ e.mu.RLock()
s.ID = stack.TCPEndpointID(e.id)
- e.mu.Unlock()
+ s.ProtocolState = uint32(e.state)
+ s.AMSS = e.amss
+ s.RcvMSS = int(e.amss) - e.maxOptionSize()
+ e.mu.RUnlock()
// Copy endpoint rcv state.
e.rcvListMu.Lock()
s.RcvBufSize = e.rcvBufSize
s.RcvBufUsed = e.rcvBufUsed
s.RcvClosed = e.rcvClosed
+ s.RcvLastAckNanos = e.rcvLastAckNanos
+ s.RcvLastDataNanos = e.rcvLastDataNanos
s.RcvAutoParams.MeasureTime = e.rcvAutoParams.measureTime
s.RcvAutoParams.CopiedBytes = e.rcvAutoParams.copied
s.RcvAutoParams.PrevCopiedBytes = e.rcvAutoParams.prevCopied
@@ -1956,6 +1959,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
s.RcvAutoParams.RTTMeasureSeqNumber = e.rcvAutoParams.rttMeasureSeqNumber
s.RcvAutoParams.RTTMeasureTime = e.rcvAutoParams.rttMeasureTime
s.RcvAutoParams.Disabled = e.rcvAutoParams.disabled
+
e.rcvListMu.Unlock()
// Endpoint TCP Option state.
@@ -1965,7 +1969,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
s.SACKPermitted = e.sackPermitted
s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
- s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()
+ s.SACK.ReceivedBlocks, s.SACK.Sacked, s.SACK.MaxSACKED = e.scoreboard.Copy()
// Copy endpoint send state.
e.sndBufMu.Lock()
@@ -2009,12 +2013,14 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
RTTMeasureTime: e.snd.rttMeasureTime,
Closed: e.snd.closed,
RTO: e.snd.rto,
+ MSS: e.snd.mss,
MaxPayloadSize: e.snd.maxPayloadSize,
SndWndScale: e.snd.sndWndScale,
MaxSentAck: e.snd.maxSentAck,
}
e.snd.rtt.Lock()
s.Sender.SRTT = e.snd.rtt.srtt
+ s.Sender.RTTVar = e.snd.rtt.rttvar
s.Sender.SRTTInited = e.snd.rtt.srttInited
e.snd.rtt.Unlock()
@@ -2059,8 +2065,8 @@ func (e *endpoint) initGSO() {
// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *endpoint) State() uint32 {
- e.mu.Lock()
- defer e.mu.Unlock()
+ e.mu.RLock()
+ defer e.mu.RUnlock()
return uint32(e.state)
}
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index e90f9a7d9..a8f490c4a 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -220,25 +220,24 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
return true
}
-// updateRTT updates the receiver RTT measurement based on the sequence number
-// of the received segment.
-func (r *receiver) updateRTT() {
+// updateRTTLocked updates the receiver RTT measurement based on the sequence
+// number of the received segment.
+//
+// Precondition: Caller must hold r.ep.rcvListMu.
+func (r *receiver) updateRTTLocked() {
// From: https://public.lanl.gov/radiant/pubs/drs/sc2001-poster.pdf
//
// A system that is only transmitting acknowledgements can still
// estimate the round-trip time by observing the time between when a byte
// is first acknowledged and the receipt of data that is at least one
// window beyond the sequence number that was acknowledged.
- r.ep.rcvListMu.Lock()
if r.ep.rcvAutoParams.rttMeasureTime.IsZero() {
// New measurement.
r.ep.rcvAutoParams.rttMeasureTime = time.Now()
r.ep.rcvAutoParams.rttMeasureSeqNumber = r.rcvNxt.Add(r.rcvWnd)
- r.ep.rcvListMu.Unlock()
return
}
if r.rcvNxt.LessThan(r.ep.rcvAutoParams.rttMeasureSeqNumber) {
- r.ep.rcvListMu.Unlock()
return
}
rtt := time.Since(r.ep.rcvAutoParams.rttMeasureTime)
@@ -250,7 +249,6 @@ func (r *receiver) updateRTT() {
}
r.ep.rcvAutoParams.rttMeasureTime = time.Now()
r.ep.rcvAutoParams.rttMeasureSeqNumber = r.rcvNxt.Add(r.rcvWnd)
- r.ep.rcvListMu.Unlock()
}
// handleRcvdSegment handles TCP segments directed at the connection managed by
@@ -291,11 +289,20 @@ func (r *receiver) handleRcvdSegment(s *segment) {
return
}
- // Since we consumed a segment update the receiver's RTT estimate
- // if required.
+ r.ep.rcvListMu.Lock()
+ // FIXME(b/137581805): Using the runtime clock here is incorrect as it
+ // doesn't account for potentially virtualized time.
+ now := time.Now().UnixNano()
+ if s.flagIsSet(header.TCPFlagAck) {
+ r.ep.rcvLastAckNanos = now
+ }
if segLen > 0 {
- r.updateRTT()
+ // Since we consumed a data segment, record its arrival time and
+ // update the receiver's RTT estimate if required.
+ r.ep.rcvLastDataNanos = now
+ r.updateRTTLocked()
}
+ r.ep.rcvListMu.Unlock()
// By consuming the current segment, we may have filled a gap in the
// sequence number domain that allows pending segments to be consumed
diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard.go b/pkg/tcpip/transport/tcp/sack_scoreboard.go
index 7ef2df377..02e52a63b 100644
--- a/pkg/tcpip/transport/tcp/sack_scoreboard.go
+++ b/pkg/tcpip/transport/tcp/sack_scoreboard.go
@@ -208,12 +208,12 @@ func (s *SACKScoreboard) Delete(seq seqnum.Value) {
}
// Copy provides a copy of the SACK scoreboard.
-func (s *SACKScoreboard) Copy() (sackBlocks []header.SACKBlock, maxSACKED seqnum.Value) {
+func (s *SACKScoreboard) Copy() (sackBlocks []header.SACKBlock, sacked seqnum.Size, maxSACKED seqnum.Value) {
s.ranges.Ascend(func(i btree.Item) bool {
sackBlocks = append(sackBlocks, i.(header.SACKBlock))
return true
})
- return sackBlocks, s.maxSACKED
+ return sackBlocks, s.sacked, s.maxSACKED
}
// IsRangeLost implements the IsLost(SeqNum) operation defined in RFC 6675
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 0fee7ab72..daf28a49a 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -124,6 +124,10 @@ type sender struct {
rtt rtt
rto time.Duration
+ // mss is the largest segment that can be sent without fragmentation.
+ // Initialized when the sender is created, read-only afterwards.
+ mss int
+
// maxPayloadSize is the maximum size of the payload of a given segment.
// It is initialized on demand.
maxPayloadSize int
@@ -201,6 +205,7 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
rto: 1 * time.Second,
rttMeasureSeqNum: iss + 1,
lastSendTime: time.Now(),
+ mss: int(mss),
maxPayloadSize: maxPayloadSize,
maxSentAck: irs + 1,
fr: fastRecovery{