summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
-rw-r--r-- pkg/sentry/socket/epsocket/epsocket.go | 131
-rw-r--r-- pkg/tcpip/stack/stack.go | 30
-rw-r--r-- pkg/tcpip/tcpip.go | 8
-rw-r--r-- pkg/tcpip/transport/tcp/endpoint.go | 56
-rw-r--r-- pkg/tcpip/transport/tcp/rcv.go | 27
-rw-r--r-- pkg/tcpip/transport/tcp/sack_scoreboard.go | 4
-rw-r--r-- pkg/tcpip/transport/tcp/snd.go | 5
-rw-r--r-- test/syscalls/linux/socket_ip_tcp_generic.cc | 23
8 files changed, 78 insertions, 206 deletions
diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index 586523d3d..0f483faa8 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -845,68 +845,6 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
return nil, syserr.ErrProtocolNotAvailable
}
-func toLinuxTCPInfo(i tcp.InfoOption) linux.TCPInfo {
- // Unimplemented fields are explicitly initialized to zero below.
- return linux.TCPInfo{
- State: uint8(translateTCPState(tcp.EndpointState(i.ProtocolState))),
- CaState: 0,
- Retransmits: 0,
- Probes: 0,
- Backoff: 0,
- Options: 0,
- WindowScale: uint8((i.Sender.SndWndScale&0xf)<<4 | (i.Receiver.RcvWndScale & 0xf)),
- DeliveryRateAppLimited: 0,
-
- RTO: uint32(i.Sender.RTO / time.Microsecond),
- ATO: 0,
- SndMss: uint32(i.Sender.MSS),
- RcvMss: uint32(i.RcvMSS),
-
- Unacked: uint32(i.Sender.Outstanding),
- Sacked: uint32(i.SACK.Sacked),
- Lost: 0,
- Retrans: 0,
- Fackets: 0,
-
- LastDataSent: uint32(i.Sender.LastSendTime.UnixNano() / int64(time.Millisecond)),
- LastAckSent: 0, // Not tracked by Linux.
- LastDataRecv: uint32(i.RcvLastDataNanos / int64(time.Millisecond)),
- LastAckRecv: uint32(i.RcvLastAckNanos / int64(time.Millisecond)),
-
- PMTU: uint32(i.SndMTU),
- RcvSsthresh: 0,
- RTT: uint32(i.Sender.SRTT / time.Microsecond),
- RTTVar: uint32(i.Sender.RTTVar / time.Microsecond),
- SndSsthresh: uint32(i.Sender.Ssthresh),
- SndCwnd: uint32(i.Sender.SndCwnd),
- Advmss: uint32(i.AMSS),
- Reordering: 0,
-
- RcvRTT: uint32(i.RcvAutoParams.RTT / time.Microsecond),
- RcvSpace: uint32(i.RcvBufSize),
-
- TotalRetrans: 0,
-
- PacingRate: 0,
- MaxPacingRate: 0,
- BytesAcked: 0,
- BytesReceived: 0,
- SegsOut: 0,
- SegsIn: 0,
-
- NotSentBytes: 0,
- MinRTT: uint32(i.RcvAutoParams.RTT / time.Microsecond),
- DataSegsIn: 0,
- DataSegsOut: 0,
-
- DeliveryRate: 0,
-
- BusyTime: 0,
- RwndLimited: 0,
- SndBufLimited: 0,
- }
-}
-
// getSockOptTCP implements GetSockOpt when level is SOL_TCP.
func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) {
switch name {
@@ -986,14 +924,17 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
return int32(time.Duration(v) / time.Second), nil
case linux.TCP_INFO:
- var v tcp.InfoOption
+ var v tcpip.TCPInfoOption
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- info := toLinuxTCPInfo(v)
+
+ // TODO(b/64800844): Translate fields once they are added to
+ // tcpip.TCPInfoOption.
+ info := linux.TCPInfo{}
// Linux truncates the output binary to outLen.
- ib := binary.Marshal(nil, usermem.ByteOrder, info)
+ ib := binary.Marshal(nil, usermem.ByteOrder, &info)
if len(ib) > outLen {
ib = ib[:outLen]
}
@@ -2434,38 +2375,6 @@ func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 {
return rv
}
-// translateTCPState translates an internal endpoint state to the equivalent
-// state in the Linux ABI.
-func translateTCPState(s tcp.EndpointState) uint32 {
- switch s {
- case tcp.StateEstablished:
- return linux.TCP_ESTABLISHED
- case tcp.StateSynSent:
- return linux.TCP_SYN_SENT
- case tcp.StateSynRecv:
- return linux.TCP_SYN_RECV
- case tcp.StateFinWait1:
- return linux.TCP_FIN_WAIT1
- case tcp.StateFinWait2:
- return linux.TCP_FIN_WAIT2
- case tcp.StateTimeWait:
- return linux.TCP_TIME_WAIT
- case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError:
- return linux.TCP_CLOSE
- case tcp.StateCloseWait:
- return linux.TCP_CLOSE_WAIT
- case tcp.StateLastAck:
- return linux.TCP_LAST_ACK
- case tcp.StateListen:
- return linux.TCP_LISTEN
- case tcp.StateClosing:
- return linux.TCP_CLOSING
- default:
- // Internal or unknown state.
- return 0
- }
-}
-
// State implements socket.Socket.State. State translates the internal state
// returned by netstack to values defined by Linux.
func (s *SocketOperations) State() uint32 {
@@ -2476,7 +2385,33 @@ func (s *SocketOperations) State() uint32 {
if !s.isPacketBased() {
// TCP socket.
- return translateTCPState(tcp.EndpointState(s.Endpoint.State()))
+ switch tcp.EndpointState(s.Endpoint.State()) {
+ case tcp.StateEstablished:
+ return linux.TCP_ESTABLISHED
+ case tcp.StateSynSent:
+ return linux.TCP_SYN_SENT
+ case tcp.StateSynRecv:
+ return linux.TCP_SYN_RECV
+ case tcp.StateFinWait1:
+ return linux.TCP_FIN_WAIT1
+ case tcp.StateFinWait2:
+ return linux.TCP_FIN_WAIT2
+ case tcp.StateTimeWait:
+ return linux.TCP_TIME_WAIT
+ case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError:
+ return linux.TCP_CLOSE
+ case tcp.StateCloseWait:
+ return linux.TCP_CLOSE_WAIT
+ case tcp.StateLastAck:
+ return linux.TCP_LAST_ACK
+ case tcp.StateListen:
+ return linux.TCP_LISTEN
+ case tcp.StateClosing:
+ return linux.TCP_CLOSING
+ default:
+ // Internal or unknown state.
+ return 0
+ }
}
// TODO(b/112063468): Export states for UDP, ICMP, and raw sockets.
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 7c31cf493..6156c3f46 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -208,10 +208,6 @@ type TCPSenderState struct {
// Cubic holds the state related to CUBIC congestion control.
Cubic TCPCubicState
-
- // MSS is the size of the largest segment that can be sent without
- // fragmentation.
- MSS int
}
// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
@@ -224,9 +220,6 @@ type TCPSACKInfo struct {
// from the peer endpoint.
ReceivedBlocks []header.SACKBlock
- // Sacked is the current number of bytes held in the SACK scoreboard.
- Sacked seqnum.Size
-
// MaxSACKED is the highest sequence number that has been SACKED
// by the peer.
MaxSACKED seqnum.Value
@@ -276,14 +269,6 @@ type TCPEndpointState struct {
// ID is a copy of the TransportEndpointID for the endpoint.
ID TCPEndpointID
- // ProtocolState denotes the TCP state the endpoint is currently
- // in, encoded in a netstack-specific manner. Should be translated
- // to the Linux ABI before exposing to userspace.
- ProtocolState uint32
-
- // AMSS is the MSS advertised to the peer by this endpoint.
- AMSS uint16
-
// SegTime denotes the absolute time when this segment was received.
SegTime time.Time
@@ -301,18 +286,6 @@ type TCPEndpointState struct {
// RcvClosed if true, indicates the endpoint has been closed for reading.
RcvClosed bool
- // RcvLastAck is the time of receipt of the last packet with the
- // ACK flag set.
- RcvLastAckNanos int64
-
- // RcvLastData is the time of reciept of the last packet
- // containing data.
- RcvLastDataNanos int64
-
- // RcvMSS is the size of the largest segment the receiver is willing to
- // accept, not including TCP headers and options.
- RcvMSS int
-
// SendTSOk is used to indicate when the TS Option has been negotiated.
// When sendTSOk is true every non-RST segment should carry a TS as per
// RFC7323#section-1.1.
@@ -353,9 +326,6 @@ type TCPEndpointState struct {
// SndMTU is the smallest MTU seen in the control packets received.
SndMTU int
- // MaxOptionSize is the maximum size of TCP options.
- MaxOptionSize int
-
// Receiver holds variables related to the TCP receiver for the endpoint.
Receiver TCPReceiverState
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 690c00edb..4208c0303 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -476,6 +476,14 @@ type QuickAckOption int
// Only supported on Unix sockets.
type PasscredOption int
+// TCPInfoOption is used by GetSockOpt to expose TCP statistics.
+//
+// TODO(b/64800844): Add and populate stat fields.
+type TCPInfoOption struct {
+ RTT time.Duration
+ RTTVar time.Duration
+}
+
// KeepaliveEnabledOption is used by SetSockOpt/GetSockOpt to specify whether
// TCP keepalive is enabled for this socket.
type KeepaliveEnabledOption int
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index e94307bd5..cc49c8272 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -108,9 +108,6 @@ func (s EndpointState) String() string {
}
}
-// InfoOption is used by GetSockOpt to expose TCP endpoint state.
-type InfoOption stack.TCPEndpointState
-
// Reasons for notifying the protocol goroutine.
const (
notifyNonZeroReceiveWindow = 1 << iota
@@ -205,14 +202,12 @@ type endpoint struct {
// to indicate to users that no more data is coming.
//
// rcvListMu can be taken after the endpoint mu below.
- rcvListMu sync.Mutex `state:"nosave"`
- rcvList segmentList `state:"wait"`
- rcvClosed bool
- rcvBufSize int
- rcvBufUsed int
- rcvAutoParams rcvBufAutoTuneParams
- rcvLastAckNanos int64 // timestamp
- rcvLastDataNanos int64 // timestamp
+ rcvListMu sync.Mutex `state:"nosave"`
+ rcvList segmentList `state:"wait"`
+ rcvClosed bool
+ rcvBufSize int
+ rcvBufUsed int
+ rcvAutoParams rcvBufAutoTuneParams
// zeroWindow indicates that the window was closed due to receive buffer
// space being filled up. This is set by the worker goroutine before
// moving a segment to the rcvList. This setting is cleared by the
@@ -1203,10 +1198,17 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
}
return nil
- case *InfoOption:
- e.workMu.Lock()
- *o = InfoOption(e.completeState())
- e.workMu.Unlock()
+ case *tcpip.TCPInfoOption:
+ *o = tcpip.TCPInfoOption{}
+ e.mu.RLock()
+ snd := e.snd
+ e.mu.RUnlock()
+ if snd != nil {
+ snd.rtt.Lock()
+ o.RTT = snd.rtt.srtt
+ o.RTTVar = snd.rtt.rttvar
+ snd.rtt.Unlock()
+ }
return nil
case *tcpip.KeepaliveEnabledOption:
@@ -1931,27 +1933,22 @@ func (e *endpoint) maxOptionSize() (size int) {
}
// completeState makes a full copy of the endpoint and returns it. This is used
-// before invoking the probe and for getsockopt(TCP_INFO). The state returned
-// may not be fully consistent if there are intervening syscalls when the state
-// is being copied.
+// before invoking the probe. The state returned may not be fully consistent if
+// there are intervening syscalls when the state is being copied.
func (e *endpoint) completeState() stack.TCPEndpointState {
var s stack.TCPEndpointState
s.SegTime = time.Now()
- e.mu.RLock()
+ // Copy EndpointID.
+ e.mu.Lock()
s.ID = stack.TCPEndpointID(e.id)
- s.ProtocolState = uint32(e.state)
- s.AMSS = e.amss
- s.RcvMSS = int(e.amss) - e.maxOptionSize()
- e.mu.RUnlock()
+ e.mu.Unlock()
// Copy endpoint rcv state.
e.rcvListMu.Lock()
s.RcvBufSize = e.rcvBufSize
s.RcvBufUsed = e.rcvBufUsed
s.RcvClosed = e.rcvClosed
- s.RcvLastAckNanos = e.rcvLastAckNanos
- s.RcvLastDataNanos = e.rcvLastDataNanos
s.RcvAutoParams.MeasureTime = e.rcvAutoParams.measureTime
s.RcvAutoParams.CopiedBytes = e.rcvAutoParams.copied
s.RcvAutoParams.PrevCopiedBytes = e.rcvAutoParams.prevCopied
@@ -1959,7 +1956,6 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
s.RcvAutoParams.RTTMeasureSeqNumber = e.rcvAutoParams.rttMeasureSeqNumber
s.RcvAutoParams.RTTMeasureTime = e.rcvAutoParams.rttMeasureTime
s.RcvAutoParams.Disabled = e.rcvAutoParams.disabled
-
e.rcvListMu.Unlock()
// Endpoint TCP Option state.
@@ -1969,7 +1965,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
s.SACKPermitted = e.sackPermitted
s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
- s.SACK.ReceivedBlocks, s.SACK.Sacked, s.SACK.MaxSACKED = e.scoreboard.Copy()
+ s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()
// Copy endpoint send state.
e.sndBufMu.Lock()
@@ -2013,14 +2009,12 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
RTTMeasureTime: e.snd.rttMeasureTime,
Closed: e.snd.closed,
RTO: e.snd.rto,
- MSS: e.snd.mss,
MaxPayloadSize: e.snd.maxPayloadSize,
SndWndScale: e.snd.sndWndScale,
MaxSentAck: e.snd.maxSentAck,
}
e.snd.rtt.Lock()
s.Sender.SRTT = e.snd.rtt.srtt
- s.Sender.RTTVar = e.snd.rtt.rttvar
s.Sender.SRTTInited = e.snd.rtt.srttInited
e.snd.rtt.Unlock()
@@ -2065,8 +2059,8 @@ func (e *endpoint) initGSO() {
// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *endpoint) State() uint32 {
- e.mu.RLock()
- defer e.mu.RUnlock()
+ e.mu.Lock()
+ defer e.mu.Unlock()
return uint32(e.state)
}
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index a8f490c4a..e90f9a7d9 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -220,24 +220,25 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
return true
}
-// updateRTTLocked updates the receiver RTT measurement based on the sequence
-// number of the received segment.
-//
-// Precondition: Caller must hold r.ep.rcvListMu.
-func (r *receiver) updateRTTLocked() {
+// updateRTT updates the receiver RTT measurement based on the sequence number
+// of the received segment.
+func (r *receiver) updateRTT() {
// From: https://public.lanl.gov/radiant/pubs/drs/sc2001-poster.pdf
//
// A system that is only transmitting acknowledgements can still
// estimate the round-trip time by observing the time between when a byte
// is first acknowledged and the receipt of data that is at least one
// window beyond the sequence number that was acknowledged.
+ r.ep.rcvListMu.Lock()
if r.ep.rcvAutoParams.rttMeasureTime.IsZero() {
// New measurement.
r.ep.rcvAutoParams.rttMeasureTime = time.Now()
r.ep.rcvAutoParams.rttMeasureSeqNumber = r.rcvNxt.Add(r.rcvWnd)
+ r.ep.rcvListMu.Unlock()
return
}
if r.rcvNxt.LessThan(r.ep.rcvAutoParams.rttMeasureSeqNumber) {
+ r.ep.rcvListMu.Unlock()
return
}
rtt := time.Since(r.ep.rcvAutoParams.rttMeasureTime)
@@ -249,6 +250,7 @@ func (r *receiver) updateRTTLocked() {
}
r.ep.rcvAutoParams.rttMeasureTime = time.Now()
r.ep.rcvAutoParams.rttMeasureSeqNumber = r.rcvNxt.Add(r.rcvWnd)
+ r.ep.rcvListMu.Unlock()
}
// handleRcvdSegment handles TCP segments directed at the connection managed by
@@ -289,20 +291,11 @@ func (r *receiver) handleRcvdSegment(s *segment) {
return
}
- r.ep.rcvListMu.Lock()
- // FIXME(b/137581805): Using the runtime clock here is incorrect as it
- // doesn't account for potentially virtualized time.
- now := time.Now().UnixNano()
- if s.flagIsSet(header.TCPFlagAck) {
- r.ep.rcvLastAckNanos = now
- }
+ // Since we consumed a segment update the receiver's RTT estimate
+ // if required.
if segLen > 0 {
- // Since we consumed a segment update the receiver's RTT estimate if
- // required.
- r.ep.rcvLastDataNanos = now
- r.updateRTTLocked()
+ r.updateRTT()
}
- r.ep.rcvListMu.Unlock()
// By consuming the current segment, we may have filled a gap in the
// sequence number domain that allows pending segments to be consumed
diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard.go b/pkg/tcpip/transport/tcp/sack_scoreboard.go
index 02e52a63b..7ef2df377 100644
--- a/pkg/tcpip/transport/tcp/sack_scoreboard.go
+++ b/pkg/tcpip/transport/tcp/sack_scoreboard.go
@@ -208,12 +208,12 @@ func (s *SACKScoreboard) Delete(seq seqnum.Value) {
}
// Copy provides a copy of the SACK scoreboard.
-func (s *SACKScoreboard) Copy() (sackBlocks []header.SACKBlock, sacked seqnum.Size, maxSACKED seqnum.Value) {
+func (s *SACKScoreboard) Copy() (sackBlocks []header.SACKBlock, maxSACKED seqnum.Value) {
s.ranges.Ascend(func(i btree.Item) bool {
sackBlocks = append(sackBlocks, i.(header.SACKBlock))
return true
})
- return sackBlocks, s.sacked, s.maxSACKED
+ return sackBlocks, s.maxSACKED
}
// IsRangeLost implements the IsLost(SeqNum) operation defined in RFC 6675
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index daf28a49a..0fee7ab72 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -124,10 +124,6 @@ type sender struct {
rtt rtt
rto time.Duration
- // mss is the largest segment that can be sent without fragmentation.
- // Initialized when then sender is created, read-only afterwards.
- mss int
-
// maxPayloadSize is the maximum size of the payload of a given segment.
// It is initialized on demand.
maxPayloadSize int
@@ -205,7 +201,6 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
rto: 1 * time.Second,
rttMeasureSeqNum: iss + 1,
lastSendTime: time.Now(),
- mss: int(mss),
maxPayloadSize: maxPayloadSize,
maxSentAck: irs + 1,
fr: fastRecovery{
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 01987d2f0..a43cf9bce 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -697,28 +697,5 @@ TEST_P(TCPSocketPairTest, SetCongestionControlFailsForUnsupported) {
EXPECT_EQ(0, memcmp(got_cc, old_cc, sizeof(old_cc)));
}
-TEST_P(TCPSocketPairTest, GetSockOptTCPInfo) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
- struct tcp_info info;
- socklen_t optlen = sizeof(info);
- ASSERT_THAT(
- getsockopt(sockets->first_fd(), SOL_TCP, TCP_INFO, &info, &optlen),
- SyscallSucceedsWithValue(0));
- EXPECT_EQ(optlen, sizeof(info));
-
- EXPECT_EQ(info.tcpi_state, TCP_ESTABLISHED);
-
- EXPECT_GT(info.tcpi_rto, 0);
-
- // IPv4 MSS is 536 bytes by default, and IPv6 is 1220 bytes.
- EXPECT_GE(info.tcpi_snd_mss, 536);
- EXPECT_GE(info.tcpi_rcv_mss, 536);
- EXPECT_GE(info.tcpi_advmss, 536);
-
- // MTU is typically 1500 for ethernet, but this is highly protocol
- // dependent. Opt for a safe lower bound.
- EXPECT_GT(info.tcpi_pmtu, 500);
-}
-
} // namespace testing
} // namespace gvisor