diff options
-rw-r--r-- | pkg/abi/linux/socket.go | 12 | ||||
-rw-r--r-- | pkg/abi/linux/tcp.go | 9 | ||||
-rw-r--r-- | pkg/sentry/socket/netstack/netstack.go | 23 | ||||
-rw-r--r-- | pkg/tcpip/tcpip.go | 44 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/endpoint.go | 34 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/rack.go | 2 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/snd.go | 36 | ||||
-rw-r--r-- | test/packetimpact/runner/defs.bzl | 3 | ||||
-rw-r--r-- | test/packetimpact/tests/BUILD | 13 | ||||
-rw-r--r-- | test/packetimpact/tests/tcp_info_test.go | 103 | ||||
-rw-r--r-- | test/syscalls/linux/socket_ip_tcp_generic.cc | 27 |
11 files changed, 265 insertions, 41 deletions
diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index 8591acbf2..cb33c37bd 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -416,6 +416,18 @@ type TCPInfo struct { RwndLimited uint64 // SndBufLimited is the time in microseconds limited by send buffer. SndBufLimited uint64 + + Delievered uint32 + DelieveredCe uint32 + + // BytesSent is RFC4898 tcpEStatsPerfHCDataOctetsOut. + BytesSent uint64 + // BytesRetrans is RFC4898 tcpEStatsPerfOctetsRetrans. + BytesRetrans uint64 + // DSACKDups is RFC4898 tcpEStatsStackDSACKDups. + DSACKDups uint32 + // ReordSeen is the number of reordering events seen. + ReordSeen uint32 } // SizeOfTCPInfo is the binary size of a TCPInfo struct. diff --git a/pkg/abi/linux/tcp.go b/pkg/abi/linux/tcp.go index 2a8d4708b..1a3c0916f 100644 --- a/pkg/abi/linux/tcp.go +++ b/pkg/abi/linux/tcp.go @@ -59,3 +59,12 @@ const ( MAX_TCP_KEEPINTVL = 32767 MAX_TCP_KEEPCNT = 127 ) + +// Congestion control states from include/uapi/linux/tcp.h. +const ( + TCP_CA_Open = 0 + TCP_CA_Disorder = 1 + TCP_CA_CWR = 2 + TCP_CA_Recovery = 3 + TCP_CA_Loss = 4 +) diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 7065a0e46..3115a227d 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -1098,6 +1098,29 @@ func getSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name, // TODO(b/64800844): Translate fields once they are added to // tcpip.TCPInfoOption. info := linux.TCPInfo{} + switch v.CcState { + case tcpip.RTORecovery: + info.CaState = linux.TCP_CA_Loss + case tcpip.FastRecovery, tcpip.SACKRecovery: + info.CaState = linux.TCP_CA_Recovery + case tcpip.Disorder: + info.CaState = linux.TCP_CA_Disorder + case tcpip.Open: + info.CaState = linux.TCP_CA_Open + } + info.RTO = uint32(v.RTO / time.Microsecond) + info.RTT = uint32(v.RTT / time.Microsecond) + info.RTTVar = uint32(v.RTTVar / time.Microsecond) + info.SndSsthresh = v.SndSsthresh + info.SndCwnd = v.SndCwnd + + // In netstack reorderSeen is updated only when RACK is enabled. + // We only track whether the reordering is seen, which is + // different than Linux where reorderSeen is not specific to + // RACK and is incremented when a reordering event is seen. + if v.ReorderSeen { + info.ReordSeen = 1 + } // Linux truncates the output binary to outLen. buf := t.CopyScratchBuffer(info.SizeBytes()) diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 812ee36ed..e70ae69ef 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -993,12 +993,54 @@ type SettableSocketOption interface { isSettableSocketOption() } +// CongestionControlState indicates the current congestion control state for +// TCP sender. +type CongestionControlState int + +const ( + // Open indicates that the sender is receiving acks in order and + // no loss or dupACK's etc have been detected. + Open CongestionControlState = iota + // RTORecovery indicates that an RTO has occurred and the sender + // has entered an RTO based recovery phase. + RTORecovery + // FastRecovery indicates that the sender has entered FastRecovery + // based on receiving nDupAck's. This state is entered only when + // SACK is not in use. + FastRecovery + // SACKRecovery indicates that the sender has entered SACK based + // recovery. + SACKRecovery + // Disorder indicates the sender either received some SACK blocks + // or dupACK's. + Disorder +) + // TCPInfoOption is used by GetSockOpt to expose TCP statistics. // // TODO(b/64800844): Add and populate stat fields. type TCPInfoOption struct { - RTT time.Duration + // RTT is the smoothed round trip time. + RTT time.Duration + + // RTTVar is the round trip time variation. RTTVar time.Duration + + // RTO is the retransmission timeout for the endpoint. + RTO time.Duration + + // CcState is the congestion control state. + CcState CongestionControlState + + // SndCwnd is the congestion window, in packets. + SndCwnd uint32 + + // SndSsthresh is the threshold between slow start and congestion + // avoidance. + SndSsthresh uint32 + + // ReorderSeen indicates if reordering is seen in the endpoint. + ReorderSeen bool } func (*TCPInfoOption) isGettableSocketOption() {} diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index b6bd6d455..bfa5b01fb 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -2011,20 +2011,34 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { } } +func (e *endpoint) getTCPInfo() tcpip.TCPInfoOption { + info := tcpip.TCPInfoOption{} + e.LockUser() + snd := e.snd + if snd != nil { + // We do not calculate RTT before sending the data packets. If + // the connection did not send and receive data, then RTT will + // be zero. + snd.rtt.Lock() + info.RTT = snd.rtt.srtt + info.RTTVar = snd.rtt.rttvar + snd.rtt.Unlock() + + info.RTO = snd.rto + info.CcState = snd.state + info.SndSsthresh = uint32(snd.sndSsthresh) + info.SndCwnd = uint32(snd.sndCwnd) + info.ReorderSeen = snd.rc.reorderSeen + } + e.UnlockUser() + return info +} + // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { switch o := opt.(type) { case *tcpip.TCPInfoOption: - *o = tcpip.TCPInfoOption{} - e.LockUser() - snd := e.snd - e.UnlockUser() - if snd != nil { - snd.rtt.Lock() - o.RTT = snd.rtt.srtt - o.RTTVar = snd.rtt.rttvar - snd.rtt.Unlock() - } + *o = e.getTCPInfo() case *tcpip.KeepaliveIdleOption: e.keepalive.Lock() diff --git a/pkg/tcpip/transport/tcp/rack.go b/pkg/tcpip/transport/tcp/rack.go index 307bacca5..fdb7e3dc6 100644 --- a/pkg/tcpip/transport/tcp/rack.go +++ b/pkg/tcpip/transport/tcp/rack.go @@ -162,7 +162,7 @@ func (s *sender) shouldSchedulePTO() bool { // The connection supports SACK. s.ep.sackPermitted && // The connection is not in loss recovery. - (s.state != RTORecovery && s.state != SACKRecovery) && + (s.state != tcpip.RTORecovery && s.state != tcpip.SACKRecovery) && // The connection has no SACKed sequences in the SACK scoreboard. s.ep.scoreboard.Sacked() == 0 } diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 079d90848..28ef9f899 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -48,28 +48,6 @@ const ( MaxRetries = 15 ) -// ccState indicates the current congestion control state for this sender. -type ccState int - -const ( - // Open indicates that the sender is receiving acks in order and - // no loss or dupACK's etc have been detected. - Open ccState = iota - // RTORecovery indicates that an RTO has occurred and the sender - // has entered an RTO based recovery phase. - RTORecovery - // FastRecovery indicates that the sender has entered FastRecovery - // based on receiving nDupAck's. This state is entered only when - // SACK is not in use. - FastRecovery - // SACKRecovery indicates that the sender has entered SACK based - // recovery. - SACKRecovery - // Disorder indicates the sender either received some SACK blocks - // or dupACK's. - Disorder -) - // congestionControl is an interface that must be implemented by any supported // congestion control algorithm. type congestionControl interface { @@ -204,7 +182,7 @@ type sender struct { maxSentAck seqnum.Value // state is the current state of congestion control for this endpoint. - state ccState + state tcpip.CongestionControlState // cc is the congestion control algorithm in use for this sender. cc congestionControl @@ -593,7 +571,7 @@ func (s *sender) retransmitTimerExpired() bool { s.leaveRecovery() } - s.state = RTORecovery + s.state = tcpip.RTORecovery s.cc.HandleRTOExpired() // Mark the next segment to be sent as the first unacknowledged one and @@ -1018,7 +996,7 @@ func (s *sender) sendData() { // "A TCP SHOULD set cwnd to no more than RW before beginning // transmission if the TCP has not sent data in the interval exceeding // the retrasmission timeout." - if !s.fr.active && s.state != RTORecovery && time.Now().Sub(s.lastSendTime) > s.rto { + if !s.fr.active && s.state != tcpip.RTORecovery && time.Now().Sub(s.lastSendTime) > s.rto { if s.sndCwnd > InitialCwnd { s.sndCwnd = InitialCwnd } @@ -1062,14 +1040,14 @@ func (s *sender) enterRecovery() { s.fr.highRxt = s.sndUna s.fr.rescueRxt = s.sndUna if s.ep.sackPermitted { - s.state = SACKRecovery + s.state = tcpip.SACKRecovery s.ep.stack.Stats().TCP.SACKRecovery.Increment() // Set TLPRxtOut to false according to // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1. s.rc.tlpRxtOut = false return } - s.state = FastRecovery + s.state = tcpip.FastRecovery s.ep.stack.Stats().TCP.FastRecovery.Increment() } @@ -1166,7 +1144,7 @@ func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) { s.fr.highRxt = s.sndUna - 1 // Do run SetPipe() to calculate the outstanding segments. s.SetPipe() - s.state = Disorder + s.state = tcpip.Disorder return false } @@ -1464,7 +1442,7 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) { if !s.fr.active { s.cc.Update(originalOutstanding - s.outstanding) if s.fr.last.LessThan(s.sndUna) { - s.state = Open + s.state = tcpip.Open } } diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl index 5c3c569de..2b9bfac76 100644 --- a/test/packetimpact/runner/defs.bzl +++ b/test/packetimpact/runner/defs.bzl @@ -281,6 +281,9 @@ ALL_TESTS = [ name = "tcp_rack", expect_netstack_failure = True, ), + PacketimpactTestInfo( + name = "tcp_info", + ), ] def validate_all_tests(): diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD index 6c6f2bdf7..42aad541f 100644 --- a/test/packetimpact/tests/BUILD +++ b/test/packetimpact/tests/BUILD @@ -390,6 +390,19 @@ packetimpact_testbench( ], ) +packetimpact_testbench( + name = "tcp_info", + srcs = ["tcp_info_test.go"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/tcpip/header", + "//pkg/usermem", + "//test/packetimpact/testbench", + "@org_golang_x_sys//unix:go_default_library", + ], +) + validate_all_tests() [packetimpact_go_test( diff --git a/test/packetimpact/tests/tcp_info_test.go b/test/packetimpact/tests/tcp_info_test.go new file mode 100644 index 000000000..b66e8f609 --- /dev/null +++ b/test/packetimpact/tests/tcp_info_test.go @@ -0,0 +1,103 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp_info_test + +import ( + "flag" + "testing" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/test/packetimpact/testbench" +) + +func init() { + testbench.Initialize(flag.CommandLine) +} + +func TestTCPInfo(t *testing.T) { + // Create a socket, listen, TCP connect, and accept. + dut := testbench.NewDUT(t) + listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFD) + + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + defer conn.Close(t) + conn.Connect(t) + + acceptFD, _ := dut.Accept(t, listenFD) + defer dut.Close(t, acceptFD) + + // Send and receive sample data. + sampleData := []byte("Sample Data") + samplePayload := &testbench.Payload{Bytes: sampleData} + dut.Send(t, acceptFD, sampleData, 0) + if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil { + t.Fatalf("expected a packet with payload %v: %s", samplePayload, err) + } + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}) + + info := linux.TCPInfo{} + infoBytes := dut.GetSockOpt(t, acceptFD, unix.SOL_TCP, unix.TCP_INFO, int32(linux.SizeOfTCPInfo)) + binary.Unmarshal(infoBytes, usermem.ByteOrder, &info) + + rtt := time.Duration(info.RTT) * time.Microsecond + rttvar := time.Duration(info.RTTVar) * time.Microsecond + rto := time.Duration(info.RTO) * time.Microsecond + if rtt == 0 || rttvar == 0 || rto == 0 { + t.Errorf("expected rtt(%v), rttvar(%v) and rto(%v) to be greater than zero", rtt, rttvar, rto) + } + if info.ReordSeen != 0 { + t.Errorf("expected the connection to not have any reordering, got: %v want: 0", info.ReordSeen) + } + if info.SndCwnd == 0 { + t.Errorf("expected send congestion window to be greater than zero") + } + if info.CaState != linux.TCP_CA_Open { + t.Errorf("expected the connection to be in open state, got: %v want: %v", info.CaState, linux.TCP_CA_Open) + } + + if t.Failed() { + t.FailNow() + } + + // Check the congestion control state and send congestion window after + // retransmission timeout. + seq := testbench.Uint32(uint32(*conn.RemoteSeqNum(t))) + dut.Send(t, acceptFD, sampleData, 0) + if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil { + t.Fatalf("expected a packet with payload %v: %s", samplePayload, err) + } + + // Expect retransmission of the packet within 1.5*RTO. + timeout := time.Duration(float64(info.RTO)*1.5) * time.Microsecond + if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: seq}, samplePayload, timeout); err != nil { + t.Fatalf("expected a packet with payload %v: %s", samplePayload, err) + } + + info = linux.TCPInfo{} + infoBytes = dut.GetSockOpt(t, acceptFD, unix.SOL_TCP, unix.TCP_INFO, int32(linux.SizeOfTCPInfo)) + binary.Unmarshal(infoBytes, usermem.ByteOrder, &info) + if info.CaState != linux.TCP_CA_Loss { + t.Errorf("expected the connection to be in loss recovery, got: %v want: %v", info.CaState, linux.TCP_CA_Loss) + } + if info.SndCwnd != 1 { + t.Errorf("expected send congestion window to be 1, got: %v %v", info.SndCwnd) + } +} diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc index 831d96262..a73987a7e 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic.cc @@ -65,6 +65,33 @@ TEST_P(TCPSocketPairTest, ZeroTcpInfoSucceeds) { SyscallSucceeds()); } +TEST_P(TCPSocketPairTest, CheckTcpInfoFields) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char buf[10] = {}; + ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), buf, sizeof(buf), 0), + SyscallSucceedsWithValue(sizeof(buf))); + + // Wait until second_fd sees the data and then recv it. + struct pollfd poll_fd = {sockets->second_fd(), POLLIN, 0}; + constexpr int kPollTimeoutMs = 2000; // Wait up to 2 seconds for the data. + ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs), + SyscallSucceedsWithValue(1)); + + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), buf, sizeof(buf), 0), + SyscallSucceedsWithValue(sizeof(buf))); + + struct tcp_info opt = {}; + socklen_t optLen = sizeof(opt); + ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_TCP, TCP_INFO, &opt, &optLen), + SyscallSucceeds()); + + // Validates the received tcp_info fields. + EXPECT_EQ(opt.tcpi_ca_state, 0); + EXPECT_GT(opt.tcpi_snd_cwnd, 0); + EXPECT_GT(opt.tcpi_rto, 0); +} + // This test validates that an RST is sent instead of a FIN when data is // unread on calls to close(2). TEST_P(TCPSocketPairTest, RSTSentOnCloseWithUnreadData) { |