diff options
-rw-r--r-- | pkg/sentry/socket/epsocket/epsocket.go | 131 | ||||
-rw-r--r-- | pkg/tcpip/stack/stack.go | 30 | ||||
-rw-r--r-- | pkg/tcpip/tcpip.go | 8 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/endpoint.go | 56 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/rcv.go | 27 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/sack_scoreboard.go | 4 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/snd.go | 5 | ||||
-rw-r--r-- | test/syscalls/linux/socket_ip_tcp_generic.cc | 23 |
8 files changed, 78 insertions, 206 deletions
diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 586523d3d..0f483faa8 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -845,68 +845,6 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return nil, syserr.ErrProtocolNotAvailable } -func toLinuxTCPInfo(i tcp.InfoOption) linux.TCPInfo { - // Unimplemented fields are explicitly initialized to zero below. - return linux.TCPInfo{ - State: uint8(translateTCPState(tcp.EndpointState(i.ProtocolState))), - CaState: 0, - Retransmits: 0, - Probes: 0, - Backoff: 0, - Options: 0, - WindowScale: uint8((i.Sender.SndWndScale&0xf)<<4 | (i.Receiver.RcvWndScale & 0xf)), - DeliveryRateAppLimited: 0, - - RTO: uint32(i.Sender.RTO / time.Microsecond), - ATO: 0, - SndMss: uint32(i.Sender.MSS), - RcvMss: uint32(i.RcvMSS), - - Unacked: uint32(i.Sender.Outstanding), - Sacked: uint32(i.SACK.Sacked), - Lost: 0, - Retrans: 0, - Fackets: 0, - - LastDataSent: uint32(i.Sender.LastSendTime.UnixNano() / int64(time.Millisecond)), - LastAckSent: 0, // Not tracked by Linux. - LastDataRecv: uint32(i.RcvLastDataNanos / int64(time.Millisecond)), - LastAckRecv: uint32(i.RcvLastAckNanos / int64(time.Millisecond)), - - PMTU: uint32(i.SndMTU), - RcvSsthresh: 0, - RTT: uint32(i.Sender.SRTT / time.Microsecond), - RTTVar: uint32(i.Sender.RTTVar / time.Microsecond), - SndSsthresh: uint32(i.Sender.Ssthresh), - SndCwnd: uint32(i.Sender.SndCwnd), - Advmss: uint32(i.AMSS), - Reordering: 0, - - RcvRTT: uint32(i.RcvAutoParams.RTT / time.Microsecond), - RcvSpace: uint32(i.RcvBufSize), - - TotalRetrans: 0, - - PacingRate: 0, - MaxPacingRate: 0, - BytesAcked: 0, - BytesReceived: 0, - SegsOut: 0, - SegsIn: 0, - - NotSentBytes: 0, - MinRTT: uint32(i.RcvAutoParams.RTT / time.Microsecond), - DataSegsIn: 0, - DataSegsOut: 0, - - DeliveryRate: 0, - - BusyTime: 0, - RwndLimited: 0, - SndBufLimited: 0, - } -} - // getSockOptTCP implements GetSockOpt when level is SOL_TCP. func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) { switch name { @@ -986,14 +924,17 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return int32(time.Duration(v) / time.Second), nil case linux.TCP_INFO: - var v tcp.InfoOption + var v tcpip.TCPInfoOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } - info := toLinuxTCPInfo(v) + + // TODO(b/64800844): Translate fields once they are added to + // tcpip.TCPInfoOption. + info := linux.TCPInfo{} // Linux truncates the output binary to outLen. - ib := binary.Marshal(nil, usermem.ByteOrder, info) + ib := binary.Marshal(nil, usermem.ByteOrder, &info) if len(ib) > outLen { ib = ib[:outLen] } @@ -2434,38 +2375,6 @@ func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { return rv } -// translateTCPState translates an internal endpoint state to the equivalent -// state in the Linux ABI. -func translateTCPState(s tcp.EndpointState) uint32 { - switch s { - case tcp.StateEstablished: - return linux.TCP_ESTABLISHED - case tcp.StateSynSent: - return linux.TCP_SYN_SENT - case tcp.StateSynRecv: - return linux.TCP_SYN_RECV - case tcp.StateFinWait1: - return linux.TCP_FIN_WAIT1 - case tcp.StateFinWait2: - return linux.TCP_FIN_WAIT2 - case tcp.StateTimeWait: - return linux.TCP_TIME_WAIT - case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError: - return linux.TCP_CLOSE - case tcp.StateCloseWait: - return linux.TCP_CLOSE_WAIT - case tcp.StateLastAck: - return linux.TCP_LAST_ACK - case tcp.StateListen: - return linux.TCP_LISTEN - case tcp.StateClosing: - return linux.TCP_CLOSING - default: - // Internal or unknown state. - return 0 - } -} - // State implements socket.Socket.State. State translates the internal state // returned by netstack to values defined by Linux. func (s *SocketOperations) State() uint32 { @@ -2476,7 +2385,33 @@ func (s *SocketOperations) State() uint32 { if !s.isPacketBased() { // TCP socket. - return translateTCPState(tcp.EndpointState(s.Endpoint.State())) + switch tcp.EndpointState(s.Endpoint.State()) { + case tcp.StateEstablished: + return linux.TCP_ESTABLISHED + case tcp.StateSynSent: + return linux.TCP_SYN_SENT + case tcp.StateSynRecv: + return linux.TCP_SYN_RECV + case tcp.StateFinWait1: + return linux.TCP_FIN_WAIT1 + case tcp.StateFinWait2: + return linux.TCP_FIN_WAIT2 + case tcp.StateTimeWait: + return linux.TCP_TIME_WAIT + case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError: + return linux.TCP_CLOSE + case tcp.StateCloseWait: + return linux.TCP_CLOSE_WAIT + case tcp.StateLastAck: + return linux.TCP_LAST_ACK + case tcp.StateListen: + return linux.TCP_LISTEN + case tcp.StateClosing: + return linux.TCP_CLOSING + default: + // Internal or unknown state. + return 0 + } } // TODO(b/112063468): Export states for UDP, ICMP, and raw sockets. diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 7c31cf493..6156c3f46 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -208,10 +208,6 @@ type TCPSenderState struct { // Cubic holds the state related to CUBIC congestion control. Cubic TCPCubicState - - // MSS is the size of the largest segment that can be sent without - // fragmentation. - MSS int } // TCPSACKInfo holds TCP SACK related information for a given TCP endpoint. @@ -224,9 +220,6 @@ type TCPSACKInfo struct { // from the peer endpoint. ReceivedBlocks []header.SACKBlock - // Sacked is the current number of bytes held in the SACK scoreboard. - Sacked seqnum.Size - // MaxSACKED is the highest sequence number that has been SACKED // by the peer. MaxSACKED seqnum.Value @@ -276,14 +269,6 @@ type TCPEndpointState struct { // ID is a copy of the TransportEndpointID for the endpoint. ID TCPEndpointID - // ProtocolState denotes the TCP state the endpoint is currently - // in, encoded in a netstack-specific manner. Should be translated - // to the Linux ABI before exposing to userspace. - ProtocolState uint32 - - // AMSS is the MSS advertised to the peer by this endpoint. - AMSS uint16 - // SegTime denotes the absolute time when this segment was received. SegTime time.Time @@ -301,18 +286,6 @@ type TCPEndpointState struct { // RcvClosed if true, indicates the endpoint has been closed for reading. RcvClosed bool - // RcvLastAck is the time of receipt of the last packet with the - // ACK flag set. - RcvLastAckNanos int64 - - // RcvLastData is the time of reciept of the last packet - // containing data. - RcvLastDataNanos int64 - - // RcvMSS is the size of the largest segment the receiver is willing to - // accept, not including TCP headers and options. - RcvMSS int - // SendTSOk is used to indicate when the TS Option has been negotiated. // When sendTSOk is true every non-RST segment should carry a TS as per // RFC7323#section-1.1. @@ -353,9 +326,6 @@ type TCPEndpointState struct { // SndMTU is the smallest MTU seen in the control packets received. SndMTU int - // MaxOptionSize is the maximum size of TCP options. - MaxOptionSize int - // Receiver holds variables related to the TCP receiver for the endpoint. Receiver TCPReceiverState diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 690c00edb..4208c0303 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -476,6 +476,14 @@ type QuickAckOption int // Only supported on Unix sockets. type PasscredOption int +// TCPInfoOption is used by GetSockOpt to expose TCP statistics. +// +// TODO(b/64800844): Add and populate stat fields. +type TCPInfoOption struct { + RTT time.Duration + RTTVar time.Duration +} + // KeepaliveEnabledOption is used by SetSockOpt/GetSockOpt to specify whether // TCP keepalive is enabled for this socket. type KeepaliveEnabledOption int diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index e94307bd5..cc49c8272 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -108,9 +108,6 @@ func (s EndpointState) String() string { } } -// InfoOption is used by GetSockOpt to expose TCP endpoint state. -type InfoOption stack.TCPEndpointState - // Reasons for notifying the protocol goroutine. const ( notifyNonZeroReceiveWindow = 1 << iota @@ -205,14 +202,12 @@ type endpoint struct { // to indicate to users that no more data is coming. // // rcvListMu can be taken after the endpoint mu below. - rcvListMu sync.Mutex `state:"nosave"` - rcvList segmentList `state:"wait"` - rcvClosed bool - rcvBufSize int - rcvBufUsed int - rcvAutoParams rcvBufAutoTuneParams - rcvLastAckNanos int64 // timestamp - rcvLastDataNanos int64 // timestamp + rcvListMu sync.Mutex `state:"nosave"` + rcvList segmentList `state:"wait"` + rcvClosed bool + rcvBufSize int + rcvBufUsed int + rcvAutoParams rcvBufAutoTuneParams // zeroWindow indicates that the window was closed due to receive buffer // space being filled up. This is set by the worker goroutine before // moving a segment to the rcvList. This setting is cleared by the @@ -1203,10 +1198,17 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { } return nil - case *InfoOption: - e.workMu.Lock() - *o = InfoOption(e.completeState()) - e.workMu.Unlock() + case *tcpip.TCPInfoOption: + *o = tcpip.TCPInfoOption{} + e.mu.RLock() + snd := e.snd + e.mu.RUnlock() + if snd != nil { + snd.rtt.Lock() + o.RTT = snd.rtt.srtt + o.RTTVar = snd.rtt.rttvar + snd.rtt.Unlock() + } return nil case *tcpip.KeepaliveEnabledOption: @@ -1931,27 +1933,22 @@ func (e *endpoint) maxOptionSize() (size int) { } // completeState makes a full copy of the endpoint and returns it. This is used -// before invoking the probe and for getsockopt(TCP_INFO). The state returned -// may not be fully consistent if there are intervening syscalls when the state -// is being copied. +// before invoking the probe. The state returned may not be fully consistent if +// there are intervening syscalls when the state is being copied. func (e *endpoint) completeState() stack.TCPEndpointState { var s stack.TCPEndpointState s.SegTime = time.Now() - e.mu.RLock() + // Copy EndpointID. + e.mu.Lock() s.ID = stack.TCPEndpointID(e.id) - s.ProtocolState = uint32(e.state) - s.AMSS = e.amss - s.RcvMSS = int(e.amss) - e.maxOptionSize() - e.mu.RUnlock() + e.mu.Unlock() // Copy endpoint rcv state. e.rcvListMu.Lock() s.RcvBufSize = e.rcvBufSize s.RcvBufUsed = e.rcvBufUsed s.RcvClosed = e.rcvClosed - s.RcvLastAckNanos = e.rcvLastAckNanos - s.RcvLastDataNanos = e.rcvLastDataNanos s.RcvAutoParams.MeasureTime = e.rcvAutoParams.measureTime s.RcvAutoParams.CopiedBytes = e.rcvAutoParams.copied s.RcvAutoParams.PrevCopiedBytes = e.rcvAutoParams.prevCopied @@ -1959,7 +1956,6 @@ func (e *endpoint) completeState() stack.TCPEndpointState { s.RcvAutoParams.RTTMeasureSeqNumber = e.rcvAutoParams.rttMeasureSeqNumber s.RcvAutoParams.RTTMeasureTime = e.rcvAutoParams.rttMeasureTime s.RcvAutoParams.Disabled = e.rcvAutoParams.disabled - e.rcvListMu.Unlock() // Endpoint TCP Option state. @@ -1969,7 +1965,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState { s.SACKPermitted = e.sackPermitted s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks) copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks]) - s.SACK.ReceivedBlocks, s.SACK.Sacked, s.SACK.MaxSACKED = e.scoreboard.Copy() + s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy() // Copy endpoint send state. e.sndBufMu.Lock() @@ -2013,14 +2009,12 @@ func (e *endpoint) completeState() stack.TCPEndpointState { RTTMeasureTime: e.snd.rttMeasureTime, Closed: e.snd.closed, RTO: e.snd.rto, - MSS: e.snd.mss, MaxPayloadSize: e.snd.maxPayloadSize, SndWndScale: e.snd.sndWndScale, MaxSentAck: e.snd.maxSentAck, } e.snd.rtt.Lock() s.Sender.SRTT = e.snd.rtt.srtt - s.Sender.RTTVar = e.snd.rtt.rttvar s.Sender.SRTTInited = e.snd.rtt.srttInited e.snd.rtt.Unlock() @@ -2065,8 +2059,8 @@ func (e *endpoint) initGSO() { // State implements tcpip.Endpoint.State. It exports the endpoint's protocol // state for diagnostics. func (e *endpoint) State() uint32 { - e.mu.RLock() - defer e.mu.RUnlock() + e.mu.Lock() + defer e.mu.Unlock() return uint32(e.state) } diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index a8f490c4a..e90f9a7d9 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -220,24 +220,25 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum return true } -// updateRTTLocked updates the receiver RTT measurement based on the sequence -// number of the received segment. -// -// Precondition: Caller must hold r.ep.rcvListMu. -func (r *receiver) updateRTTLocked() { +// updateRTT updates the receiver RTT measurement based on the sequence number +// of the received segment. +func (r *receiver) updateRTT() { // From: https://public.lanl.gov/radiant/pubs/drs/sc2001-poster.pdf // // A system that is only transmitting acknowledgements can still // estimate the round-trip time by observing the time between when a byte // is first acknowledged and the receipt of data that is at least one // window beyond the sequence number that was acknowledged. + r.ep.rcvListMu.Lock() if r.ep.rcvAutoParams.rttMeasureTime.IsZero() { // New measurement. r.ep.rcvAutoParams.rttMeasureTime = time.Now() r.ep.rcvAutoParams.rttMeasureSeqNumber = r.rcvNxt.Add(r.rcvWnd) + r.ep.rcvListMu.Unlock() return } if r.rcvNxt.LessThan(r.ep.rcvAutoParams.rttMeasureSeqNumber) { + r.ep.rcvListMu.Unlock() return } rtt := time.Since(r.ep.rcvAutoParams.rttMeasureTime) @@ -249,6 +250,7 @@ func (r *receiver) updateRTTLocked() { } r.ep.rcvAutoParams.rttMeasureTime = time.Now() r.ep.rcvAutoParams.rttMeasureSeqNumber = r.rcvNxt.Add(r.rcvWnd) + r.ep.rcvListMu.Unlock() } // handleRcvdSegment handles TCP segments directed at the connection managed by @@ -289,20 +291,11 @@ func (r *receiver) handleRcvdSegment(s *segment) { return } - r.ep.rcvListMu.Lock() - // FIXME(b/137581805): Using the runtime clock here is incorrect as it - // doesn't account for potentially virtualized time. - now := time.Now().UnixNano() - if s.flagIsSet(header.TCPFlagAck) { - r.ep.rcvLastAckNanos = now - } + // Since we consumed a segment update the receiver's RTT estimate + // if required. if segLen > 0 { - // Since we consumed a segment update the receiver's RTT estimate if - // required. - r.ep.rcvLastDataNanos = now - r.updateRTTLocked() + r.updateRTT() } - r.ep.rcvListMu.Unlock() // By consuming the current segment, we may have filled a gap in the // sequence number domain that allows pending segments to be consumed diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard.go b/pkg/tcpip/transport/tcp/sack_scoreboard.go index 02e52a63b..7ef2df377 100644 --- a/pkg/tcpip/transport/tcp/sack_scoreboard.go +++ b/pkg/tcpip/transport/tcp/sack_scoreboard.go @@ -208,12 +208,12 @@ func (s *SACKScoreboard) Delete(seq seqnum.Value) { } // Copy provides a copy of the SACK scoreboard. -func (s *SACKScoreboard) Copy() (sackBlocks []header.SACKBlock, sacked seqnum.Size, maxSACKED seqnum.Value) { +func (s *SACKScoreboard) Copy() (sackBlocks []header.SACKBlock, maxSACKED seqnum.Value) { s.ranges.Ascend(func(i btree.Item) bool { sackBlocks = append(sackBlocks, i.(header.SACKBlock)) return true }) - return sackBlocks, s.sacked, s.maxSACKED + return sackBlocks, s.maxSACKED } // IsRangeLost implements the IsLost(SeqNum) operation defined in RFC 6675 diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index daf28a49a..0fee7ab72 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -124,10 +124,6 @@ type sender struct { rtt rtt rto time.Duration - // mss is the largest segment that can be sent without fragmentation. - // Initialized when then sender is created, read-only afterwards. - mss int - // maxPayloadSize is the maximum size of the payload of a given segment. // It is initialized on demand. maxPayloadSize int @@ -205,7 +201,6 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint rto: 1 * time.Second, rttMeasureSeqNum: iss + 1, lastSendTime: time.Now(), - mss: int(mss), maxPayloadSize: maxPayloadSize, maxSentAck: irs + 1, fr: fastRecovery{ diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc index 01987d2f0..a43cf9bce 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic.cc @@ -697,28 +697,5 @@ TEST_P(TCPSocketPairTest, SetCongestionControlFailsForUnsupported) { EXPECT_EQ(0, memcmp(got_cc, old_cc, sizeof(old_cc))); } -TEST_P(TCPSocketPairTest, GetSockOptTCPInfo) { - auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); - struct tcp_info info; - socklen_t optlen = sizeof(info); - ASSERT_THAT( - getsockopt(sockets->first_fd(), SOL_TCP, TCP_INFO, &info, &optlen), - SyscallSucceedsWithValue(0)); - EXPECT_EQ(optlen, sizeof(info)); - - EXPECT_EQ(info.tcpi_state, TCP_ESTABLISHED); - - EXPECT_GT(info.tcpi_rto, 0); - - // IPv4 MSS is 536 bytes by default, and IPv6 is 1220 bytes. - EXPECT_GE(info.tcpi_snd_mss, 536); - EXPECT_GE(info.tcpi_rcv_mss, 536); - EXPECT_GE(info.tcpi_advmss, 536); - - // MTU is typically 1500 for ethernet, but this is highly protocol - // dependent. Opt for a safe lower bound. - EXPECT_GT(info.tcpi_pmtu, 500); -} - } // namespace testing } // namespace gvisor |