Fix two bugs in TCP sender.

a) When GSO is in use we should not cap the segment to maxPayloadSize in sender.maybeSendSegment as the GSO logic will cap the segment to the correct size. Without this the host GSO is not used as we end up breaking up large segments into small MSS sized segments before writing the packets to the host. b) The check to not split a segment due to it not fitting in the receiver window when there are pending segments is incorrect as segments in writeList can be really large as we just take the write call's buffer size and create a single large segment. So a write of say 128KB will just be 1 segment in the writeList. The linux code checks if 1 MSS sized segments fits in the receiver's window and if not then does not split the current segment. gVisor's check was incorrect that it was checking if the whole segment which could be >>> 1 MSS would fit in the receiver's window. This was causing us to prematurely stop sending and falling back to retransmit timer/probe from the other end to send data. This was seen when running HTTPD benchmarks where @ HEAD when sending large files the benchmark was taking forever to run. The tcp_splitseg_mss_test.go is being deleted as the test as written doesn't test what is intended correctly. This is because GSO is enabled by default and the reason the MSS+1 sized segment is sent is because GSO is in use. A proper test will require disabling GSO on linux and netstack which is going to take a bit of work in packetimpact to do it correctly. Separately a new test probably should be written that verifies that a segment > availableWindow is not split if the availableWindow is < 1 MSS. Fixes #3107 PiperOrigin-RevId: 319172089
author: Bhasker Hariharan <bhaskerh@google.com> 2020-06-30 23:55:16 -0700
committer: gVisor bot <gvisor-bot@google.com> 2020-06-30 23:56:34 -0700
commit: c9446f05347b865d75bfdacd6769fce265171b9b (patch)
tree: 6004738ac2116aff3814f431bd4e681d8c4ae181
parent: 43f5dd95a1c58a6e3260c31093bfc3f97885f4b0 (diff)
4 files changed, 46 insertions, 106 deletions
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index acacb42e4..5862c32f2 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -833,25 +833,6 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 			panic("Netstack queues FIN segments without data.")
 		}
 
-		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size()))
-		// If the entire segment cannot be accomodated in the receiver
-		// advertized window, skip splitting and sending of the segment.
-		// ref: net/ipv4/tcp_output.c::tcp_snd_wnd_test()
-		//
-		// Linux checks this for all segment transmits not triggered
-		// by a probe timer. On this condition, it defers the segment
-		// split and transmit to a short probe timer.
-		// ref: include/net/tcp.h::tcp_check_probe_timer()
-		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
-		//
-		// Instead of defining a new transmit timer, we attempt to split the
-		// segment right here if there are no pending segments.
-		// If there are pending segments, segment transmits are deferred
-		// to the retransmit timer handler.
-		if s.sndUna != s.sndNxt && !segEnd.LessThan(end) {
-			return false
-		}
-
 		if !seg.sequenceNumber.LessThan(end) {
 			return false
 		}
@@ -861,14 +842,48 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 			return false
 		}
 
-		// The segment size limit is computed as a function of sender congestion
-		// window and MSS. When sender congestion window is > 1, this limit can
-		// be larger than MSS. Ensure that the currently available send space
-		// is not greater than minimum of this limit and MSS.
+		// If the whole segment or at least 1MSS sized segment cannot
+		// be accomodated in the receiver advertized window, skip
+		// splitting and sending of the segment. ref:
+		// net/ipv4/tcp_output.c::tcp_snd_wnd_test()
+		//
+		// Linux checks this for all segment transmits not triggered by
+		// a probe timer. On this condition, it defers the segment split
+		// and transmit to a short probe timer.
+		//
+		// ref: include/net/tcp.h::tcp_check_probe_timer()
+		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
+		//
+		// Instead of defining a new transmit timer, we attempt to split
+		// the segment right here if there are no pending segments. If
+		// there are pending segments, segment transmits are deferred to
+		// the retransmit timer handler.
+		if s.sndUna != s.sndNxt {
+			switch {
+			case available >= seg.data.Size():
+				// OK to send, the whole segments fits in the
+				// receiver's advertised window.
+			case available >= s.maxPayloadSize:
+				// OK to send, at least 1 MSS sized segment fits
+				// in the receiver's advertised window.
+			default:
+				return false
+			}
+		}
+
+		// The segment size limit is computed as a function of sender
+		// congestion window and MSS. When sender congestion window is >
+		// 1, this limit can be larger than MSS. Ensure that the
+		// currently available send space is not greater than minimum of
+		// this limit and MSS.
 		if available > limit {
 			available = limit
 		}
-		if available > s.maxPayloadSize {
+
+		// If GSO is not in use then cap available to
+		// maxPayloadSize. When GSO is in use the gVisor GSO logic or
+		// the host GSO logic will cap the segment to the correct size.
+		if s.ep.gso == nil && available > s.maxPayloadSize {
 			available = s.maxPayloadSize
 		}
 
diff --git a/test/packetimpact/runner/packetimpact_test.go b/test/packetimpact/runner/packetimpact_test.go
index 3cac5915f..c0a2620de 100644
--- a/test/packetimpact/runner/packetimpact_test.go
+++ b/test/packetimpact/runner/packetimpact_test.go
@@ -242,7 +242,13 @@ func TestOne(t *testing.T) {
 	}
 
 	// Kill so that it will flush output.
-	defer testbench.Exec(dockerutil.RunOpts{}, "killall", snifferArgs[0])
+	defer func() {
+		// Wait 1 second before killing tcpdump to give it time to flush
+		// any packets.  On linux tests killing it immediately can
+		// sometimes result in partial pcaps.
+		time.Sleep(1 * time.Second)
+		testbench.Exec(dockerutil.RunOpts{}, "killall", snifferArgs[0])
+	}()
 
 	if _, err := testbench.WaitForOutput(snifferRegex, 60*time.Second); err != nil {
 		t.Fatalf("sniffer on %s never listened: %s", dut.Name, err)
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index 1c9f59df5..4711c1ae6 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -213,16 +213,6 @@ packetimpact_go_test(
 )
 
 packetimpact_go_test(
-    name = "tcp_splitseg_mss",
-    srcs = ["tcp_splitseg_mss_test.go"],
-    deps = [
-        "//pkg/tcpip/header",
-        "//test/packetimpact/testbench",
-        "@org_golang_x_sys//unix:go_default_library",
-    ],
-)
-
-packetimpact_go_test(
     name = "tcp_cork_mss",
     srcs = ["tcp_cork_mss_test.go"],
     deps = [
diff --git a/test/packetimpact/tests/tcp_splitseg_mss_test.go b/test/packetimpact/tests/tcp_splitseg_mss_test.go
deleted file mode 100644
index 9350d0988..000000000
--- a/test/packetimpact/tests/tcp_splitseg_mss_test.go
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package tcp_splitseg_mss_test
-
-import (
-	"flag"
-	"testing"
-	"time"
-
-	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-	"gvisor.dev/gvisor/test/packetimpact/testbench"
-)
-
-func init() {
-	testbench.RegisterFlags(flag.CommandLine)
-}
-
-// TestTCPSplitSegMSS lets the dut try to send segments larger than MSS.
-// It tests if the transmitted segments are capped at MSS and are split.
-func TestTCPSplitSegMSS(t *testing.T) {
-	dut := testbench.NewDUT(t)
-	defer dut.TearDown()
-	listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
-	defer dut.Close(listenFD)
-	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
-	defer conn.Close()
-
-	const mss = uint32(header.TCPDefaultMSS)
-	options := make([]byte, header.TCPOptionMSSLength)
-	header.EncodeMSSOption(mss, options)
-	conn.ConnectWithOptions(options)
-
-	acceptFD, _ := dut.Accept(listenFD)
-	defer dut.Close(acceptFD)
-
-	// Let the dut send a segment larger than MSS.
-	largeData := make([]byte, mss+1)
-	for i := 0; i < 2; i++ {
-		dut.Send(acceptFD, largeData, 0)
-		if i == 0 {
-			// On Linux, the initial segment goes out beyond MSS and the segment
-			// split occurs on retransmission. Call ExpectData to wait to
-			// receive the split segment.
-			if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: largeData[:mss]}, time.Second); err != nil {
-				t.Fatalf("expected payload was not received: %s", err)
-			}
-		} else {
-			if _, err := conn.ExpectNextData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: largeData[:mss]}, time.Second); err != nil {
-				t.Fatalf("expected payload was not received: %s", err)
-			}
-		}
-		conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
-		if _, err := conn.ExpectNextData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &testbench.Payload{Bytes: largeData[mss:]}, time.Second); err != nil {
-			t.Fatalf("expected payload was not received: %s", err)
-		}
-		conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
-	}
-}
author	Bhasker Hariharan <bhaskerh@google.com>	2020-06-30 23:55:16 -0700
committer	gVisor bot <gvisor-bot@google.com>	2020-06-30 23:56:34 -0700
commit	c9446f05347b865d75bfdacd6769fce265171b9b (patch)
tree	6004738ac2116aff3814f431bd4e681d8c4ae181
parent	43f5dd95a1c58a6e3260c31093bfc3f97885f4b0 (diff)