summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorNayana Bidari <nybidari@google.com>2021-01-27 16:11:49 -0800
committergVisor bot <gvisor-bot@google.com>2021-01-27 16:14:50 -0800
commit99988e45ed651f64e16e2f2663b06b4a1eee50d4 (patch)
treeb5e4682669273698d6c84fbec56ace958afcf43a
parentcdf49c4433a83d9be6e8a3fb8b09bf457661d39f (diff)
Add support for more fields in netstack for TCP_INFO
This CL adds support for the following fields: RTT, RTTVar and RTO; send congestion window (sndCwnd) and send slow start threshold (sndSsthresh); congestion control state (CaState); ReorderSeen. PiperOrigin-RevId: 354195361
-rw-r--r--pkg/abi/linux/socket.go12
-rw-r--r--pkg/abi/linux/tcp.go9
-rw-r--r--pkg/sentry/socket/netstack/netstack.go23
-rw-r--r--pkg/tcpip/tcpip.go44
-rw-r--r--pkg/tcpip/transport/tcp/endpoint.go34
-rw-r--r--pkg/tcpip/transport/tcp/rack.go2
-rw-r--r--pkg/tcpip/transport/tcp/snd.go36
-rw-r--r--test/packetimpact/runner/defs.bzl3
-rw-r--r--test/packetimpact/tests/BUILD13
-rw-r--r--test/packetimpact/tests/tcp_info_test.go103
-rw-r--r--test/syscalls/linux/socket_ip_tcp_generic.cc27
11 files changed, 265 insertions, 41 deletions
diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go
index 8591acbf2..cb33c37bd 100644
--- a/pkg/abi/linux/socket.go
+++ b/pkg/abi/linux/socket.go
@@ -416,6 +416,18 @@ type TCPInfo struct {
RwndLimited uint64
// SndBufLimited is the time in microseconds limited by send buffer.
SndBufLimited uint64
+
+ Delievered uint32
+ DelieveredCe uint32
+
+ // BytesSent is RFC4898 tcpEStatsPerfHCDataOctetsOut.
+ BytesSent uint64
+ // BytesRetrans is RFC4898 tcpEStatsPerfOctetsRetrans.
+ BytesRetrans uint64
+ // DSACKDups is RFC4898 tcpEStatsStackDSACKDups.
+ DSACKDups uint32
+ // ReordSeen is the number of reordering events seen.
+ ReordSeen uint32
}
// SizeOfTCPInfo is the binary size of a TCPInfo struct.
diff --git a/pkg/abi/linux/tcp.go b/pkg/abi/linux/tcp.go
index 2a8d4708b..1a3c0916f 100644
--- a/pkg/abi/linux/tcp.go
+++ b/pkg/abi/linux/tcp.go
@@ -59,3 +59,12 @@ const (
MAX_TCP_KEEPINTVL = 32767
MAX_TCP_KEEPCNT = 127
)
+
+// Congestion control states from include/uapi/linux/tcp.h.
+const (
+ TCP_CA_Open = 0
+ TCP_CA_Disorder = 1
+ TCP_CA_CWR = 2
+ TCP_CA_Recovery = 3
+ TCP_CA_Loss = 4
+)
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 7065a0e46..3115a227d 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1098,6 +1098,29 @@ func getSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name,
// TODO(b/64800844): Translate fields once they are added to
// tcpip.TCPInfoOption.
info := linux.TCPInfo{}
+ switch v.CcState {
+ case tcpip.RTORecovery:
+ info.CaState = linux.TCP_CA_Loss
+ case tcpip.FastRecovery, tcpip.SACKRecovery:
+ info.CaState = linux.TCP_CA_Recovery
+ case tcpip.Disorder:
+ info.CaState = linux.TCP_CA_Disorder
+ case tcpip.Open:
+ info.CaState = linux.TCP_CA_Open
+ }
+ info.RTO = uint32(v.RTO / time.Microsecond)
+ info.RTT = uint32(v.RTT / time.Microsecond)
+ info.RTTVar = uint32(v.RTTVar / time.Microsecond)
+ info.SndSsthresh = v.SndSsthresh
+ info.SndCwnd = v.SndCwnd
+
+ // In netstack reorderSeen is updated only when RACK is enabled.
+ // We only track whether the reordering is seen, which is
+ // different than Linux where reorderSeen is not specific to
+ // RACK and is incremented when a reordering event is seen.
+ if v.ReorderSeen {
+ info.ReordSeen = 1
+ }
// Linux truncates the output binary to outLen.
buf := t.CopyScratchBuffer(info.SizeBytes())
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 812ee36ed..e70ae69ef 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -993,12 +993,54 @@ type SettableSocketOption interface {
isSettableSocketOption()
}
+// CongestionControlState indicates the current congestion control state for
+// TCP sender.
+type CongestionControlState int
+
+const (
+ // Open indicates that the sender is receiving acks in order and
+ // no loss or dupACK's etc have been detected.
+ Open CongestionControlState = iota
+ // RTORecovery indicates that an RTO has occurred and the sender
+ // has entered an RTO based recovery phase.
+ RTORecovery
+ // FastRecovery indicates that the sender has entered FastRecovery
+ // based on receiving nDupAck's. This state is entered only when
+ // SACK is not in use.
+ FastRecovery
+ // SACKRecovery indicates that the sender has entered SACK based
+ // recovery.
+ SACKRecovery
+ // Disorder indicates the sender either received some SACK blocks
+ // or dupACK's.
+ Disorder
+)
+
// TCPInfoOption is used by GetSockOpt to expose TCP statistics.
//
// TODO(b/64800844): Add and populate stat fields.
type TCPInfoOption struct {
- RTT time.Duration
+ // RTT is the smoothed round trip time.
+ RTT time.Duration
+
+ // RTTVar is the round trip time variation.
RTTVar time.Duration
+
+ // RTO is the retransmission timeout for the endpoint.
+ RTO time.Duration
+
+ // CcState is the congestion control state.
+ CcState CongestionControlState
+
+ // SndCwnd is the congestion window, in packets.
+ SndCwnd uint32
+
+ // SndSsthresh is the threshold between slow start and congestion
+ // avoidance.
+ SndSsthresh uint32
+
+ // ReorderSeen indicates if reordering is seen in the endpoint.
+ ReorderSeen bool
}
func (*TCPInfoOption) isGettableSocketOption() {}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index b6bd6d455..bfa5b01fb 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2011,20 +2011,34 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
}
}
+func (e *endpoint) getTCPInfo() tcpip.TCPInfoOption {
+ info := tcpip.TCPInfoOption{}
+ e.LockUser() // Held until every sender field below has been copied.
+ snd := e.snd
+ if snd != nil { // Without a sender the zero-valued option is returned.
+ // RTT is not calculated before data packets are sent. If the
+ // connection has not sent and received data, then RTT will
+ // be zero.
+ snd.rtt.Lock() // srtt and rttvar are read under the rtt lock.
+ info.RTT = snd.rtt.srtt
+ info.RTTVar = snd.rtt.rttvar
+ snd.rtt.Unlock()
+
+ info.RTO = snd.rto
+ info.CcState = snd.state
+ info.SndSsthresh = uint32(snd.sndSsthresh)
+ info.SndCwnd = uint32(snd.sndCwnd)
+ info.ReorderSeen = snd.rc.reorderSeen
+ }
+ e.UnlockUser()
+ return info
+}
+
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
switch o := opt.(type) {
case *tcpip.TCPInfoOption:
- *o = tcpip.TCPInfoOption{}
- e.LockUser()
- snd := e.snd
- e.UnlockUser()
- if snd != nil {
- snd.rtt.Lock()
- o.RTT = snd.rtt.srtt
- o.RTTVar = snd.rtt.rttvar
- snd.rtt.Unlock()
- }
+ *o = e.getTCPInfo()
case *tcpip.KeepaliveIdleOption:
e.keepalive.Lock()
diff --git a/pkg/tcpip/transport/tcp/rack.go b/pkg/tcpip/transport/tcp/rack.go
index 307bacca5..fdb7e3dc6 100644
--- a/pkg/tcpip/transport/tcp/rack.go
+++ b/pkg/tcpip/transport/tcp/rack.go
@@ -162,7 +162,7 @@ func (s *sender) shouldSchedulePTO() bool {
// The connection supports SACK.
s.ep.sackPermitted &&
// The connection is not in loss recovery.
- (s.state != RTORecovery && s.state != SACKRecovery) &&
+ (s.state != tcpip.RTORecovery && s.state != tcpip.SACKRecovery) &&
// The connection has no SACKed sequences in the SACK scoreboard.
s.ep.scoreboard.Sacked() == 0
}
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 079d90848..28ef9f899 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -48,28 +48,6 @@ const (
MaxRetries = 15
)
-// ccState indicates the current congestion control state for this sender.
-type ccState int
-
-const (
- // Open indicates that the sender is receiving acks in order and
- // no loss or dupACK's etc have been detected.
- Open ccState = iota
- // RTORecovery indicates that an RTO has occurred and the sender
- // has entered an RTO based recovery phase.
- RTORecovery
- // FastRecovery indicates that the sender has entered FastRecovery
- // based on receiving nDupAck's. This state is entered only when
- // SACK is not in use.
- FastRecovery
- // SACKRecovery indicates that the sender has entered SACK based
- // recovery.
- SACKRecovery
- // Disorder indicates the sender either received some SACK blocks
- // or dupACK's.
- Disorder
-)
-
// congestionControl is an interface that must be implemented by any supported
// congestion control algorithm.
type congestionControl interface {
@@ -204,7 +182,7 @@ type sender struct {
maxSentAck seqnum.Value
// state is the current state of congestion control for this endpoint.
- state ccState
+ state tcpip.CongestionControlState
// cc is the congestion control algorithm in use for this sender.
cc congestionControl
@@ -593,7 +571,7 @@ func (s *sender) retransmitTimerExpired() bool {
s.leaveRecovery()
}
- s.state = RTORecovery
+ s.state = tcpip.RTORecovery
s.cc.HandleRTOExpired()
// Mark the next segment to be sent as the first unacknowledged one and
@@ -1018,7 +996,7 @@ func (s *sender) sendData() {
// "A TCP SHOULD set cwnd to no more than RW before beginning
// transmission if the TCP has not sent data in the interval exceeding
// the retrasmission timeout."
- if !s.fr.active && s.state != RTORecovery && time.Now().Sub(s.lastSendTime) > s.rto {
+ if !s.fr.active && s.state != tcpip.RTORecovery && time.Now().Sub(s.lastSendTime) > s.rto {
if s.sndCwnd > InitialCwnd {
s.sndCwnd = InitialCwnd
}
@@ -1062,14 +1040,14 @@ func (s *sender) enterRecovery() {
s.fr.highRxt = s.sndUna
s.fr.rescueRxt = s.sndUna
if s.ep.sackPermitted {
- s.state = SACKRecovery
+ s.state = tcpip.SACKRecovery
s.ep.stack.Stats().TCP.SACKRecovery.Increment()
// Set TLPRxtOut to false according to
// https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1.
s.rc.tlpRxtOut = false
return
}
- s.state = FastRecovery
+ s.state = tcpip.FastRecovery
s.ep.stack.Stats().TCP.FastRecovery.Increment()
}
@@ -1166,7 +1144,7 @@ func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) {
s.fr.highRxt = s.sndUna - 1
// Do run SetPipe() to calculate the outstanding segments.
s.SetPipe()
- s.state = Disorder
+ s.state = tcpip.Disorder
return false
}
@@ -1464,7 +1442,7 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
if !s.fr.active {
s.cc.Update(originalOutstanding - s.outstanding)
if s.fr.last.LessThan(s.sndUna) {
- s.state = Open
+ s.state = tcpip.Open
}
}
diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl
index 5c3c569de..2b9bfac76 100644
--- a/test/packetimpact/runner/defs.bzl
+++ b/test/packetimpact/runner/defs.bzl
@@ -281,6 +281,9 @@ ALL_TESTS = [
name = "tcp_rack",
expect_netstack_failure = True,
),
+ PacketimpactTestInfo(
+ name = "tcp_info",
+ ),
]
def validate_all_tests():
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 6c6f2bdf7..42aad541f 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -390,6 +390,19 @@ packetimpact_testbench(
],
)
+packetimpact_testbench(
+ name = "tcp_info",
+ srcs = ["tcp_info_test.go"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/binary",
+ "//pkg/tcpip/header",
+ "//pkg/usermem",
+ "//test/packetimpact/testbench",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
validate_all_tests()
[packetimpact_go_test(
diff --git a/test/packetimpact/tests/tcp_info_test.go b/test/packetimpact/tests/tcp_info_test.go
new file mode 100644
index 000000000..b66e8f609
--- /dev/null
+++ b/test/packetimpact/tests/tcp_info_test.go
@@ -0,0 +1,103 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_info_test
+
+import (
+ "flag"
+ "testing"
+ "time"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+ testbench.Initialize(flag.CommandLine)
+}
+
+func TestTCPInfo(t *testing.T) {
+ // Create a socket, listen, TCP connect, and accept.
+ dut := testbench.NewDUT(t)
+ listenFD, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+ defer dut.Close(t, listenFD)
+
+ conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+ defer conn.Close(t)
+ conn.Connect(t)
+
+ acceptFD, _ := dut.Accept(t, listenFD)
+ defer dut.Close(t, acceptFD)
+
+ // Send sample data from the DUT and ACK it from the testbench.
+ sampleData := []byte("Sample Data")
+ samplePayload := &testbench.Payload{Bytes: sampleData}
+ dut.Send(t, acceptFD, sampleData, 0)
+ if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil {
+ t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+ }
+ conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+
+ info := linux.TCPInfo{}
+ infoBytes := dut.GetSockOpt(t, acceptFD, unix.SOL_TCP, unix.TCP_INFO, int32(linux.SizeOfTCPInfo))
+ binary.Unmarshal(infoBytes, usermem.ByteOrder, &info)
+
+ rtt := time.Duration(info.RTT) * time.Microsecond
+ rttvar := time.Duration(info.RTTVar) * time.Microsecond
+ rto := time.Duration(info.RTO) * time.Microsecond
+ if rtt == 0 || rttvar == 0 || rto == 0 {
+ t.Errorf("expected rtt(%v), rttvar(%v) and rto(%v) to be greater than zero", rtt, rttvar, rto)
+ }
+ if info.ReordSeen != 0 {
+ t.Errorf("expected the connection to not have any reordering, got: %v want: 0", info.ReordSeen)
+ }
+ if info.SndCwnd == 0 {
+ t.Errorf("expected send congestion window to be greater than zero")
+ }
+ if info.CaState != linux.TCP_CA_Open {
+ t.Errorf("expected the connection to be in open state, got: %v want: %v", info.CaState, linux.TCP_CA_Open)
+ }
+
+ if t.Failed() {
+ t.FailNow()
+ }
+
+ // Check the congestion control state and send congestion window after
+ // retransmission timeout.
+ seq := testbench.Uint32(uint32(*conn.RemoteSeqNum(t)))
+ dut.Send(t, acceptFD, sampleData, 0)
+ if _, err := conn.ExpectData(t, &testbench.TCP{}, samplePayload, time.Second); err != nil {
+ t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+ }
+
+ // Expect retransmission of the packet within 1.5*RTO.
+ timeout := time.Duration(float64(info.RTO)*1.5) * time.Microsecond
+ if _, err := conn.ExpectData(t, &testbench.TCP{SeqNum: seq}, samplePayload, timeout); err != nil {
+ t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
+ }
+
+ info = linux.TCPInfo{}
+ infoBytes = dut.GetSockOpt(t, acceptFD, unix.SOL_TCP, unix.TCP_INFO, int32(linux.SizeOfTCPInfo))
+ binary.Unmarshal(infoBytes, usermem.ByteOrder, &info)
+ if info.CaState != linux.TCP_CA_Loss {
+ t.Errorf("expected the connection to be in loss recovery, got: %v want: %v", info.CaState, linux.TCP_CA_Loss)
+ }
+ if info.SndCwnd != 1 {
+ t.Errorf("expected send congestion window to be 1, got: %v", info.SndCwnd)
+ }
+}
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 831d96262..a73987a7e 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -65,6 +65,33 @@ TEST_P(TCPSocketPairTest, ZeroTcpInfoSucceeds) {
SyscallSucceeds());
}
+TEST_P(TCPSocketPairTest, CheckTcpInfoFields) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char buf[10] = {};
+ ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Wait until second_fd sees the data and then recv it.
+ struct pollfd poll_fd = {sockets->second_fd(), POLLIN, 0};
+ constexpr int kPollTimeoutMs = 2000; // Wait up to 2 seconds for the data.
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ struct tcp_info opt = {};
+ socklen_t optLen = sizeof(opt);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_TCP, TCP_INFO, &opt, &optLen),
+ SyscallSucceeds());
+
+ // Validate the tcp_info fields reported for the sending socket.
+ EXPECT_EQ(opt.tcpi_ca_state, 0); // 0 is TCP_CA_Open.
+ EXPECT_GT(opt.tcpi_snd_cwnd, 0);
+ EXPECT_GT(opt.tcpi_rto, 0);
+}
+
// This test validates that an RST is sent instead of a FIN when data is
// unread on calls to close(2).
TEST_P(TCPSocketPairTest, RSTSentOnCloseWithUnreadData) {