diff options
author | Mithun Iyer <iyerm@google.com> | 2020-12-05 01:45:47 -0800 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2020-12-05 01:48:00 -0800 |
commit | 3075ede86edf04117f120b3cf110e699034b759c (patch) | |
tree | 7c62f7c40bbffc7d032b8f3b1d7b040933b28c94 | |
parent | df2dbe3e38aac73926a51d8045dbb419c6167cf5 (diff) |
Fix zero receive window advertisements.
With the recent changes db36d948fa63ce950d94a5e8e9ebc37956543661, we try
to balance the receive window advertisements between payload lengths vs
segment overhead length. This works fine when segment size are much
higher than the overhead, but not otherwise. In cases where the segment
length is smaller than the segment overhead, we may end up not
advertising zero receive window for long time and end up tail-dropping
segments. This is especially pronounced when application socket reads
are slow or stopped. In this change we do not grow the right edge of
the receive window for smaller segment sizes similar to Linux.
Also, we keep track of the socket buffer usage and let the window grow
if the application is actively reading data.
Fixes #4903
PiperOrigin-RevId: 345832012
-rw-r--r-- | pkg/tcpip/transport/tcp/connect.go | 11 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/rcv.go | 52 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/segment.go | 2 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/segment_unsafe.go | 3 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/tcp_test.go | 5 | ||||
-rw-r--r-- | test/packetimpact/runner/defs.bzl | 3 | ||||
-rw-r--r-- | test/packetimpact/tests/BUILD | 10 | ||||
-rw-r--r-- | test/packetimpact/tests/tcp_zero_receive_window_test.go | 113 |
8 files changed, 188 insertions, 11 deletions
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 31eded0ce..c944dccc0 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -16,6 +16,7 @@ package tcp import ( "encoding/binary" + "math" "time" "gvisor.dev/gvisor/pkg/rand" @@ -133,7 +134,7 @@ func FindWndScale(wnd seqnum.Size) int { return 0 } - max := seqnum.Size(0xffff) + max := seqnum.Size(math.MaxUint16) s := 0 for wnd > max && s < header.MaxWndScale { s++ @@ -818,8 +819,8 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso data = data.Clone(nil) optLen := len(tf.opts) - if tf.rcvWnd > 0xffff { - tf.rcvWnd = 0xffff + if tf.rcvWnd > math.MaxUint16 { + tf.rcvWnd = math.MaxUint16 } mss := int(gso.MSS) @@ -863,8 +864,8 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso // network endpoint and under the provided identity. func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error { optLen := len(tf.opts) - if tf.rcvWnd > 0xffff { - tf.rcvWnd = 0xffff + if tf.rcvWnd > math.MaxUint16 { + tf.rcvWnd = math.MaxUint16 } if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() { diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index 8e0b7c843..f2b1b68da 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -16,6 +16,7 @@ package tcp import ( "container/heap" + "math" "time" "gvisor.dev/gvisor/pkg/tcpip" @@ -48,6 +49,10 @@ type receiver struct { rcvWndScale uint8 + // prevBufused is the snapshot of endpoint rcvBufUsed taken when we + // advertise a receive window. + prevBufUsed int + closed bool // pendingRcvdSegments is bounded by the receive buffer size of the @@ -80,9 +85,9 @@ func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool { // outgoing packets, we should use what we have advertised for acceptability // test. scaledWindowSize := r.rcvWnd >> r.rcvWndScale - if scaledWindowSize > 0xffff { + if scaledWindowSize > math.MaxUint16 { // This is what we actually put in the Window field. - scaledWindowSize = 0xffff + scaledWindowSize = math.MaxUint16 } advertisedWindowSize := scaledWindowSize << r.rcvWndScale return header.Acceptable(segSeq, segLen, r.rcvNxt, r.rcvNxt.Add(advertisedWindowSize)) @@ -106,6 +111,34 @@ func (r *receiver) currentWindow() (curWnd seqnum.Size) { func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) { newWnd := r.ep.selectWindow() curWnd := r.currentWindow() + unackLen := int(r.ep.snd.maxSentAck.Size(r.rcvNxt)) + bufUsed := r.ep.receiveBufferUsed() + + // Grow the right edge of the window only for payloads larger than the + // the segment overhead OR if the application is actively consuming data. + // + // Avoiding growing the right edge otherwise, addresses a situation below: + // An application has been slow in reading data and we have burst of + // incoming segments lengths < segment overhead. Here, our available free + // memory would reduce drastically when compared to the advertised receive + // window. + // + // For example: With incoming 512 bytes segments, segment overhead of + // 552 bytes (at the time of writing this comment), with receive window + // starting from 1MB and with rcvAdvWndScale being 1, buffer would reach 0 + // when the curWnd is still 19436 bytes, because for every incoming segment + // newWnd would reduce by (552+512) >> rcvAdvWndScale (current value 1), + // while curWnd would reduce by 512 bytes. + // Such a situation causes us to keep tail dropping the incoming segments + // and never advertise zero receive window to the peer. + // + // Linux does a similar check for minimal sk_buff size (128): + // https://github.com/torvalds/linux/blob/d5beb3140f91b1c8a3d41b14d729aefa4dcc58bc/net/ipv4/tcp_input.c#L783 + // + // Also, if the application is reading the data, we keep growing the right + // edge, as we are still advertising a window that we think can be serviced. + toGrow := unackLen >= SegSize || bufUsed <= r.prevBufUsed + // Update rcvAcc only if new window is > previously advertised window. We // should never shrink the acceptable sequence space once it has been // advertised the peer. If we shrink the acceptable sequence space then we @@ -115,7 +148,7 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) { // rcvWUP rcvNxt rcvAcc new rcvAcc // <=====curWnd ===> // <========= newWnd > curWnd ========= > - if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) { + if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) && toGrow { // If the new window moves the right edge, then update rcvAcc. r.rcvAcc = r.rcvNxt.Add(seqnum.Size(newWnd)) } else { @@ -130,11 +163,24 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) { // receiver's estimated RTT. r.rcvWnd = newWnd r.rcvWUP = r.rcvNxt + r.prevBufUsed = bufUsed scaledWnd := r.rcvWnd >> r.rcvWndScale if scaledWnd == 0 { // Increment a metric if we are advertising an actual zero window. r.ep.stats.ReceiveErrors.ZeroRcvWindowState.Increment() } + + // If we started off with a window larger than what can he held in + // the 16bit window field, we ceil the value to the max value. + // While ceiling, we still do not want to grow the right edge when + // not applicable. + if scaledWnd > math.MaxUint16 { + if toGrow { + scaledWnd = seqnum.Size(math.MaxUint16) + } else { + scaledWnd = seqnum.Size(uint16(scaledWnd)) + } + } return r.rcvNxt, scaledWnd } diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 2091989cc..5ef73ec74 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -204,7 +204,7 @@ func (s *segment) payloadSize() int { // segMemSize is the amount of memory used to hold the segment data and // the associated metadata. func (s *segment) segMemSize() int { - return segSize + s.data.Size() + return SegSize + s.data.Size() } // parse populates the sequence & ack numbers, flags, and window fields of the diff --git a/pkg/tcpip/transport/tcp/segment_unsafe.go b/pkg/tcpip/transport/tcp/segment_unsafe.go index 0ab7b8f56..392ff0859 100644 --- a/pkg/tcpip/transport/tcp/segment_unsafe.go +++ b/pkg/tcpip/transport/tcp/segment_unsafe.go @@ -19,5 +19,6 @@ import ( ) const ( - segSize = int(unsafe.Sizeof(segment{})) + // SegSize is the minimal size of the segment overhead. + SegSize = int(unsafe.Sizeof(segment{})) ) diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 98620e6db..1759ebea9 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -6098,10 +6098,13 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) { // Introduce a 25ms latency by delaying the first byte. latency := 25 * time.Millisecond time.Sleep(latency) - rawEP.SendPacketWithTS([]byte{1}, tsVal) + // Send an initial payload with atleast segment overhead size. The receive + // window would not grow for smaller segments. + rawEP.SendPacketWithTS(make([]byte, tcp.SegSize), tsVal) pkt := rawEP.VerifyAndReturnACKWithTS(tsVal) rcvWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize() + time.Sleep(25 * time.Millisecond) // Allocate a large enough payload for the test. diff --git a/test/packetimpact/runner/defs.bzl b/test/packetimpact/runner/defs.bzl index 86833eade..c6c95546a 100644 --- a/test/packetimpact/runner/defs.bzl +++ b/test/packetimpact/runner/defs.bzl @@ -224,6 +224,9 @@ ALL_TESTS = [ name = "tcp_user_timeout", ), PacketimpactTestInfo( + name = "tcp_zero_receive_window", + ), + PacketimpactTestInfo( name = "tcp_queue_receive_in_syn_sent", ), PacketimpactTestInfo( diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD index ce21c079a..373ab8d2f 100644 --- a/test/packetimpact/tests/BUILD +++ b/test/packetimpact/tests/BUILD @@ -366,6 +366,16 @@ packetimpact_testbench( ], ) +packetimpact_testbench( + name = "tcp_zero_receive_window", + srcs = ["tcp_zero_receive_window_test.go"], + deps = [ + "//pkg/tcpip/header", + "//test/packetimpact/testbench", + "@org_golang_x_sys//unix:go_default_library", + ], +) + validate_all_tests() [packetimpact_go_test( diff --git a/test/packetimpact/tests/tcp_zero_receive_window_test.go b/test/packetimpact/tests/tcp_zero_receive_window_test.go new file mode 100644 index 000000000..cf0431c57 --- /dev/null +++ b/test/packetimpact/tests/tcp_zero_receive_window_test.go @@ -0,0 +1,113 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp_zero_receive_window_test + +import ( + "context" + "flag" + "fmt" + "testing" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/test/packetimpact/testbench" +) + +func init() { + testbench.Initialize(flag.CommandLine) +} + +// TestZeroReceiveWindow tests if the DUT sends a zero receive window eventually. +func TestZeroReceiveWindow(t *testing.T) { + for _, payloadLen := range []int{64, 512, 1024} { + t.Run(fmt.Sprintf("TestZeroReceiveWindow_with_%dbytes_payload", payloadLen), func(t *testing.T) { + dut := testbench.NewDUT(t) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFd) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + defer conn.Close(t) + + conn.Connect(t) + acceptFd, _ := dut.Accept(t, listenFd) + defer dut.Close(t, acceptFd) + + dut.SetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) + + samplePayload := &testbench.Payload{Bytes: make([]byte, payloadLen)} //testbench.GenerateRandomPayload(t, payloadLen)} + // Expect the DUT to eventually advertize zero receive window. + // The test would timeout otherwise. + for { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload) + gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second) + if err != nil { + t.Fatalf("expected packet was not received: %s", err) + } + if *gotTCP.WindowSize == 0 { + break + } + } + }) + } +} + +// TestNonZeroReceiveWindow tests for the DUT to never send a zero receive +// window when the data is being read from the socket buffer. +func TestNonZeroReceiveWindow(t *testing.T) { + for _, payloadLen := range []int{64, 512, 1024} { + t.Run(fmt.Sprintf("TestZeroReceiveWindow_with_%dbytes_payload", payloadLen), func(t *testing.T) { + dut := testbench.NewDUT(t) + listenFd, remotePort := dut.CreateListener(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, 1) + defer dut.Close(t, listenFd) + conn := dut.Net.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + defer conn.Close(t) + + conn.Connect(t) + acceptFd, _ := dut.Accept(t, listenFd) + defer dut.Close(t, acceptFd) + + dut.SetSockOptInt(t, acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1) + + samplePayload := &testbench.Payload{Bytes: testbench.GenerateRandomPayload(t, payloadLen)} + var rcvWindow uint16 + initRcv := false + // This loop keeps a running rcvWindow value from the initial ACK for the data + // we sent. Once we have received ACKs with non-zero receive windows, we break + // the loop. + for { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload) + gotTCP, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second) + if err != nil { + t.Fatalf("expected packet was not received: %s", err) + } + if ret, _, err := dut.RecvWithErrno(context.Background(), t, acceptFd, int32(payloadLen), 0); ret == -1 { + t.Fatalf("dut.RecvWithErrno(ctx, t, %d, %d, 0) = %d,_, %s", acceptFd, payloadLen, ret, err) + } + + if *gotTCP.WindowSize == 0 { + t.Fatalf("expected non-zero receive window.") + } + if !initRcv { + rcvWindow = uint16(*gotTCP.WindowSize) + initRcv = true + } + if rcvWindow <= uint16(payloadLen) { + break + } + rcvWindow -= uint16(payloadLen) + } + }) + } +} |