summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--pkg/sentry/socket/epsocket/epsocket.go131
-rw-r--r--pkg/tcpip/stack/stack.go30
-rw-r--r--pkg/tcpip/tcpip.go8
-rw-r--r--pkg/tcpip/transport/tcp/endpoint.go56
-rw-r--r--pkg/tcpip/transport/tcp/rcv.go27
-rw-r--r--pkg/tcpip/transport/tcp/sack_scoreboard.go4
-rw-r--r--pkg/tcpip/transport/tcp/snd.go5
-rw-r--r--test/syscalls/linux/socket_ip_tcp_generic.cc23
8 files changed, 206 insertions, 78 deletions
diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index 0f483faa8..586523d3d 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -845,6 +845,68 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
return nil, syserr.ErrProtocolNotAvailable
}
+func toLinuxTCPInfo(i tcp.InfoOption) linux.TCPInfo {
+ // Unimplemented fields are explicitly initialized to zero below.
+ return linux.TCPInfo{
+ State: uint8(translateTCPState(tcp.EndpointState(i.ProtocolState))),
+ CaState: 0,
+ Retransmits: 0,
+ Probes: 0,
+ Backoff: 0,
+ Options: 0,
+ WindowScale: uint8((i.Sender.SndWndScale&0xf)<<4 | (i.Receiver.RcvWndScale & 0xf)),
+ DeliveryRateAppLimited: 0,
+
+ RTO: uint32(i.Sender.RTO / time.Microsecond),
+ ATO: 0,
+ SndMss: uint32(i.Sender.MSS),
+ RcvMss: uint32(i.RcvMSS),
+
+ Unacked: uint32(i.Sender.Outstanding),
+ Sacked: uint32(i.SACK.Sacked),
+ Lost: 0,
+ Retrans: 0,
+ Fackets: 0,
+
+ LastDataSent: uint32(i.Sender.LastSendTime.UnixNano() / int64(time.Millisecond)),
+ LastAckSent: 0, // Not tracked by Linux.
+ LastDataRecv: uint32(i.RcvLastDataNanos / int64(time.Millisecond)),
+ LastAckRecv: uint32(i.RcvLastAckNanos / int64(time.Millisecond)),
+
+ PMTU: uint32(i.SndMTU),
+ RcvSsthresh: 0,
+ RTT: uint32(i.Sender.SRTT / time.Microsecond),
+ RTTVar: uint32(i.Sender.RTTVar / time.Microsecond),
+ SndSsthresh: uint32(i.Sender.Ssthresh),
+ SndCwnd: uint32(i.Sender.SndCwnd),
+ Advmss: uint32(i.AMSS),
+ Reordering: 0,
+
+ RcvRTT: uint32(i.RcvAutoParams.RTT / time.Microsecond),
+ RcvSpace: uint32(i.RcvBufSize),
+
+ TotalRetrans: 0,
+
+ PacingRate: 0,
+ MaxPacingRate: 0,
+ BytesAcked: 0,
+ BytesReceived: 0,
+ SegsOut: 0,
+ SegsIn: 0,
+
+ NotSentBytes: 0,
+ MinRTT: uint32(i.RcvAutoParams.RTT / time.Microsecond),
+ DataSegsIn: 0,
+ DataSegsOut: 0,
+
+ DeliveryRate: 0,
+
+ BusyTime: 0,
+ RwndLimited: 0,
+ SndBufLimited: 0,
+ }
+}
+
// getSockOptTCP implements GetSockOpt when level is SOL_TCP.
func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) {
switch name {
@@ -924,17 +986,14 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
return int32(time.Duration(v) / time.Second), nil
case linux.TCP_INFO:
- var v tcpip.TCPInfoOption
+ var v tcp.InfoOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- // TODO(b/64800844): Translate fields once they are added to
- // tcpip.TCPInfoOption.
- info := linux.TCPInfo{}
+ info := toLinuxTCPInfo(v)
// Linux truncates the output binary to outLen.
- ib := binary.Marshal(nil, usermem.ByteOrder, &info)
+ ib := binary.Marshal(nil, usermem.ByteOrder, info)
if len(ib) > outLen {
ib = ib[:outLen]
}
@@ -2375,6 +2434,38 @@ func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 {
return rv
}
+// translateTCPState translates an internal endpoint state to the equivalent
+// state in the Linux ABI.
+func translateTCPState(s tcp.EndpointState) uint32 {
+ switch s {
+ case tcp.StateEstablished:
+ return linux.TCP_ESTABLISHED
+ case tcp.StateSynSent:
+ return linux.TCP_SYN_SENT
+ case tcp.StateSynRecv:
+ return linux.TCP_SYN_RECV
+ case tcp.StateFinWait1:
+ return linux.TCP_FIN_WAIT1
+ case tcp.StateFinWait2:
+ return linux.TCP_FIN_WAIT2
+ case tcp.StateTimeWait:
+ return linux.TCP_TIME_WAIT
+ case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError:
+ return linux.TCP_CLOSE
+ case tcp.StateCloseWait:
+ return linux.TCP_CLOSE_WAIT
+ case tcp.StateLastAck:
+ return linux.TCP_LAST_ACK
+ case tcp.StateListen:
+ return linux.TCP_LISTEN
+ case tcp.StateClosing:
+ return linux.TCP_CLOSING
+ default:
+ // Internal or unknown state.
+ return 0
+ }
+}
+
// State implements socket.Socket.State. State translates the internal state
// returned by netstack to values defined by Linux.
func (s *SocketOperations) State() uint32 {
@@ -2385,33 +2476,7 @@ func (s *SocketOperations) State() uint32 {
if !s.isPacketBased() {
// TCP socket.
- switch tcp.EndpointState(s.Endpoint.State()) {
- case tcp.StateEstablished:
- return linux.TCP_ESTABLISHED
- case tcp.StateSynSent:
- return linux.TCP_SYN_SENT
- case tcp.StateSynRecv:
- return linux.TCP_SYN_RECV
- case tcp.StateFinWait1:
- return linux.TCP_FIN_WAIT1
- case tcp.StateFinWait2:
- return linux.TCP_FIN_WAIT2
- case tcp.StateTimeWait:
- return linux.TCP_TIME_WAIT
- case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError:
- return linux.TCP_CLOSE
- case tcp.StateCloseWait:
- return linux.TCP_CLOSE_WAIT
- case tcp.StateLastAck:
- return linux.TCP_LAST_ACK
- case tcp.StateListen:
- return linux.TCP_LISTEN
- case tcp.StateClosing:
- return linux.TCP_CLOSING
- default:
- // Internal or unknown state.
- return 0
- }
+ return translateTCPState(tcp.EndpointState(s.Endpoint.State()))
}
// TODO(b/112063468): Export states for UDP, ICMP, and raw sockets.
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 6156c3f46..7c31cf493 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -208,6 +208,10 @@ type TCPSenderState struct {
// Cubic holds the state related to CUBIC congestion control.
Cubic TCPCubicState
+
+ // MSS is the size of the largest segment that can be sent without
+ // fragmentation.
+ MSS int
}
// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
@@ -220,6 +224,9 @@ type TCPSACKInfo struct {
// from the peer endpoint.
ReceivedBlocks []header.SACKBlock
+ // Sacked is the current number of bytes held in the SACK scoreboard.
+ Sacked seqnum.Size
+
// MaxSACKED is the highest sequence number that has been SACKED
// by the peer.
MaxSACKED seqnum.Value
@@ -269,6 +276,14 @@ type TCPEndpointState struct {
// ID is a copy of the TransportEndpointID for the endpoint.
ID TCPEndpointID
+ // ProtocolState denotes the TCP state the endpoint is currently
+ // in, encoded in a netstack-specific manner. Should be translated
+ // to the Linux ABI before exposing to userspace.
+ ProtocolState uint32
+
+ // AMSS is the MSS advertised to the peer by this endpoint.
+ AMSS uint16
+
// SegTime denotes the absolute time when this segment was received.
SegTime time.Time
@@ -286,6 +301,18 @@ type TCPEndpointState struct {
// RcvClosed if true, indicates the endpoint has been closed for reading.
RcvClosed bool
+ // RcvLastAck is the time of receipt of the last packet with the
+ // ACK flag set.
+ RcvLastAckNanos int64
+
+ // RcvLastData is the time of reciept of the last packet
+ // containing data.
+ RcvLastDataNanos int64
+
+ // RcvMSS is the size of the largest segment the receiver is willing to
+ // accept, not including TCP headers and options.
+ RcvMSS int
+
// SendTSOk is used to indicate when the TS Option has been negotiated.
// When sendTSOk is true every non-RST segment should carry a TS as per
// RFC7323#section-1.1.
@@ -326,6 +353,9 @@ type TCPEndpointState struct {
// SndMTU is the smallest MTU seen in the control packets received.
SndMTU int
+ // MaxOptionSize is the maximum size of TCP options.
+ MaxOptionSize int
+
// Receiver holds variables related to the TCP receiver for the endpoint.
Receiver TCPReceiverState
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 4208c0303..690c00edb 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -476,14 +476,6 @@ type QuickAckOption int
// Only supported on Unix sockets.
type PasscredOption int
-// TCPInfoOption is used by GetSockOpt to expose TCP statistics.
-//
-// TODO(b/64800844): Add and populate stat fields.
-type TCPInfoOption struct {
- RTT time.Duration
- RTTVar time.Duration
-}
-
// KeepaliveEnabledOption is used by SetSockOpt/GetSockOpt to specify whether
// TCP keepalive is enabled for this socket.
type KeepaliveEnabledOption int
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index cc49c8272..e94307bd5 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -108,6 +108,9 @@ func (s EndpointState) String() string {
}
}
+// InfoOption is used by GetSockOpt to expose TCP endpoint state.
+type InfoOption stack.TCPEndpointState
+
// Reasons for notifying the protocol goroutine.
const (
notifyNonZeroReceiveWindow = 1 << iota
@@ -202,12 +205,14 @@ type endpoint struct {
// to indicate to users that no more data is coming.
//
// rcvListMu can be taken after the endpoint mu below.
- rcvListMu sync.Mutex `state:"nosave"`
- rcvList segmentList `state:"wait"`
- rcvClosed bool
- rcvBufSize int
- rcvBufUsed int
- rcvAutoParams rcvBufAutoTuneParams
+ rcvListMu sync.Mutex `state:"nosave"`
+ rcvList segmentList `state:"wait"`
+ rcvClosed bool
+ rcvBufSize int
+ rcvBufUsed int
+ rcvAutoParams rcvBufAutoTuneParams
+ rcvLastAckNanos int64 // timestamp
+ rcvLastDataNanos int64 // timestamp
// zeroWindow indicates that the window was closed due to receive buffer
// space being filled up. This is set by the worker goroutine before
// moving a segment to the rcvList. This setting is cleared by the
@@ -1198,17 +1203,10 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
}
return nil
- case *tcpip.TCPInfoOption:
- *o = tcpip.TCPInfoOption{}
- e.mu.RLock()
- snd := e.snd
- e.mu.RUnlock()
- if snd != nil {
- snd.rtt.Lock()
- o.RTT = snd.rtt.srtt
- o.RTTVar = snd.rtt.rttvar
- snd.rtt.Unlock()
- }
+ case *InfoOption:
+ e.workMu.Lock()
+ *o = InfoOption(e.completeState())
+ e.workMu.Unlock()
return nil
case *tcpip.KeepaliveEnabledOption:
@@ -1933,22 +1931,27 @@ func (e *endpoint) maxOptionSize() (size int) {
}
// completeState makes a full copy of the endpoint and returns it. This is used
-// before invoking the probe. The state returned may not be fully consistent if
-// there are intervening syscalls when the state is being copied.
+// before invoking the probe and for getsockopt(TCP_INFO). The state returned
+// may not be fully consistent if there are intervening syscalls when the state
+// is being copied.
func (e *endpoint) completeState() stack.TCPEndpointState {
var s stack.TCPEndpointState
s.SegTime = time.Now()
- // Copy EndpointID.
- e.mu.Lock()
+ e.mu.RLock()
s.ID = stack.TCPEndpointID(e.id)
- e.mu.Unlock()
+ s.ProtocolState = uint32(e.state)
+ s.AMSS = e.amss
+ s.RcvMSS = int(e.amss) - e.maxOptionSize()
+ e.mu.RUnlock()
// Copy endpoint rcv state.
e.rcvListMu.Lock()
s.RcvBufSize = e.rcvBufSize
s.RcvBufUsed = e.rcvBufUsed
s.RcvClosed = e.rcvClosed
+ s.RcvLastAckNanos = e.rcvLastAckNanos
+ s.RcvLastDataNanos = e.rcvLastDataNanos
s.RcvAutoParams.MeasureTime = e.rcvAutoParams.measureTime
s.RcvAutoParams.CopiedBytes = e.rcvAutoParams.copied
s.RcvAutoParams.PrevCopiedBytes = e.rcvAutoParams.prevCopied
@@ -1956,6 +1959,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
s.RcvAutoParams.RTTMeasureSeqNumber = e.rcvAutoParams.rttMeasureSeqNumber
s.RcvAutoParams.RTTMeasureTime = e.rcvAutoParams.rttMeasureTime
s.RcvAutoParams.Disabled = e.rcvAutoParams.disabled
+
e.rcvListMu.Unlock()
// Endpoint TCP Option state.
@@ -1965,7 +1969,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
s.SACKPermitted = e.sackPermitted
s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
- s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()
+ s.SACK.ReceivedBlocks, s.SACK.Sacked, s.SACK.MaxSACKED = e.scoreboard.Copy()
// Copy endpoint send state.
e.sndBufMu.Lock()
@@ -2009,12 +2013,14 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
RTTMeasureTime: e.snd.rttMeasureTime,
Closed: e.snd.closed,
RTO: e.snd.rto,
+ MSS: e.snd.mss,
MaxPayloadSize: e.snd.maxPayloadSize,
SndWndScale: e.snd.sndWndScale,
MaxSentAck: e.snd.maxSentAck,
}
e.snd.rtt.Lock()
s.Sender.SRTT = e.snd.rtt.srtt
+ s.Sender.RTTVar = e.snd.rtt.rttvar
s.Sender.SRTTInited = e.snd.rtt.srttInited
e.snd.rtt.Unlock()
@@ -2059,8 +2065,8 @@ func (e *endpoint) initGSO() {
// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *endpoint) State() uint32 {
- e.mu.Lock()
- defer e.mu.Unlock()
+ e.mu.RLock()
+ defer e.mu.RUnlock()
return uint32(e.state)
}
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index e90f9a7d9..a8f490c4a 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -220,25 +220,24 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
return true
}
-// updateRTT updates the receiver RTT measurement based on the sequence number
-// of the received segment.
-func (r *receiver) updateRTT() {
+// updateRTTLocked updates the receiver RTT measurement based on the sequence
+// number of the received segment.
+//
+// Precondition: Caller must hold r.ep.rcvListMu.
+func (r *receiver) updateRTTLocked() {
// From: https://public.lanl.gov/radiant/pubs/drs/sc2001-poster.pdf
//
// A system that is only transmitting acknowledgements can still
// estimate the round-trip time by observing the time between when a byte
// is first acknowledged and the receipt of data that is at least one
// window beyond the sequence number that was acknowledged.
- r.ep.rcvListMu.Lock()
if r.ep.rcvAutoParams.rttMeasureTime.IsZero() {
// New measurement.
r.ep.rcvAutoParams.rttMeasureTime = time.Now()
r.ep.rcvAutoParams.rttMeasureSeqNumber = r.rcvNxt.Add(r.rcvWnd)
- r.ep.rcvListMu.Unlock()
return
}
if r.rcvNxt.LessThan(r.ep.rcvAutoParams.rttMeasureSeqNumber) {
- r.ep.rcvListMu.Unlock()
return
}
rtt := time.Since(r.ep.rcvAutoParams.rttMeasureTime)
@@ -250,7 +249,6 @@ func (r *receiver) updateRTT() {
}
r.ep.rcvAutoParams.rttMeasureTime = time.Now()
r.ep.rcvAutoParams.rttMeasureSeqNumber = r.rcvNxt.Add(r.rcvWnd)
- r.ep.rcvListMu.Unlock()
}
// handleRcvdSegment handles TCP segments directed at the connection managed by
@@ -291,11 +289,20 @@ func (r *receiver) handleRcvdSegment(s *segment) {
return
}
- // Since we consumed a segment update the receiver's RTT estimate
- // if required.
+ r.ep.rcvListMu.Lock()
+ // FIXME(b/137581805): Using the runtime clock here is incorrect as it
+ // doesn't account for potentially virtualized time.
+ now := time.Now().UnixNano()
+ if s.flagIsSet(header.TCPFlagAck) {
+ r.ep.rcvLastAckNanos = now
+ }
if segLen > 0 {
- r.updateRTT()
+ // Since we consumed a segment update the receiver's RTT estimate if
+ // required.
+ r.ep.rcvLastDataNanos = now
+ r.updateRTTLocked()
}
+ r.ep.rcvListMu.Unlock()
// By consuming the current segment, we may have filled a gap in the
// sequence number domain that allows pending segments to be consumed
diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard.go b/pkg/tcpip/transport/tcp/sack_scoreboard.go
index 7ef2df377..02e52a63b 100644
--- a/pkg/tcpip/transport/tcp/sack_scoreboard.go
+++ b/pkg/tcpip/transport/tcp/sack_scoreboard.go
@@ -208,12 +208,12 @@ func (s *SACKScoreboard) Delete(seq seqnum.Value) {
}
// Copy provides a copy of the SACK scoreboard.
-func (s *SACKScoreboard) Copy() (sackBlocks []header.SACKBlock, maxSACKED seqnum.Value) {
+func (s *SACKScoreboard) Copy() (sackBlocks []header.SACKBlock, sacked seqnum.Size, maxSACKED seqnum.Value) {
s.ranges.Ascend(func(i btree.Item) bool {
sackBlocks = append(sackBlocks, i.(header.SACKBlock))
return true
})
- return sackBlocks, s.maxSACKED
+ return sackBlocks, s.sacked, s.maxSACKED
}
// IsRangeLost implements the IsLost(SeqNum) operation defined in RFC 6675
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 0fee7ab72..daf28a49a 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -124,6 +124,10 @@ type sender struct {
rtt rtt
rto time.Duration
+ // mss is the largest segment that can be sent without fragmentation.
+ // Initialized when then sender is created, read-only afterwards.
+ mss int
+
// maxPayloadSize is the maximum size of the payload of a given segment.
// It is initialized on demand.
maxPayloadSize int
@@ -201,6 +205,7 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
rto: 1 * time.Second,
rttMeasureSeqNum: iss + 1,
lastSendTime: time.Now(),
+ mss: int(mss),
maxPayloadSize: maxPayloadSize,
maxSentAck: irs + 1,
fr: fastRecovery{
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index a43cf9bce..01987d2f0 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -697,5 +697,28 @@ TEST_P(TCPSocketPairTest, SetCongestionControlFailsForUnsupported) {
EXPECT_EQ(0, memcmp(got_cc, old_cc, sizeof(old_cc)));
}
+TEST_P(TCPSocketPairTest, GetSockOptTCPInfo) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ struct tcp_info info;
+ socklen_t optlen = sizeof(info);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_TCP, TCP_INFO, &info, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(optlen, sizeof(info));
+
+ EXPECT_EQ(info.tcpi_state, TCP_ESTABLISHED);
+
+ EXPECT_GT(info.tcpi_rto, 0);
+
+ // IPv4 MSS is 536 bytes by default, and IPv6 is 1220 bytes.
+ EXPECT_GE(info.tcpi_snd_mss, 536);
+ EXPECT_GE(info.tcpi_rcv_mss, 536);
+ EXPECT_GE(info.tcpi_advmss, 536);
+
+ // MTU is typically 1500 for ethernet, but this is highly protocol
+ // dependent. Opt for a safe lower bound.
+ EXPECT_GT(info.tcpi_pmtu, 500);
+}
+
} // namespace testing
} // namespace gvisor