summaryrefslogtreecommitdiffhomepage
path: root/pkg/tcpip/stack
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/tcpip/stack')
-rw-r--r--pkg/tcpip/stack/BUILD4
-rw-r--r--pkg/tcpip/stack/conntrack.go234
-rw-r--r--pkg/tcpip/stack/hook_string.go41
-rw-r--r--pkg/tcpip/stack/iptables.go7
-rw-r--r--pkg/tcpip/stack/iptables_targets.go78
-rw-r--r--pkg/tcpip/stack/ndp_test.go20
-rw-r--r--pkg/tcpip/stack/neighbor_entry_test.go8
-rw-r--r--pkg/tcpip/stack/packet_buffer.go11
-rw-r--r--pkg/tcpip/stack/route.go2
-rw-r--r--pkg/tcpip/stack/stack.go309
-rw-r--r--pkg/tcpip/stack/stack_global_state.go72
-rw-r--r--pkg/tcpip/stack/stack_options.go4
-rw-r--r--pkg/tcpip/stack/stack_test.go71
-rw-r--r--pkg/tcpip/stack/tcp.go451
-rw-r--r--pkg/tcpip/stack/transport_demuxer.go17
-rw-r--r--pkg/tcpip/stack/transport_test.go4
16 files changed, 853 insertions, 480 deletions
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 49362333a..2bd6a67f5 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -45,6 +45,7 @@ go_library(
"addressable_endpoint_state.go",
"conntrack.go",
"headertype_string.go",
+ "hook_string.go",
"icmp_rate_limit.go",
"iptables.go",
"iptables_state.go",
@@ -66,6 +67,7 @@ go_library(
"stack.go",
"stack_global_state.go",
"stack_options.go",
+ "tcp.go",
"transport_demuxer.go",
"tuple_list.go",
],
@@ -115,6 +117,7 @@ go_test(
"//pkg/tcpip/network/ipv4",
"//pkg/tcpip/network/ipv6",
"//pkg/tcpip/ports",
+ "//pkg/tcpip/testutil",
"//pkg/tcpip/transport/icmp",
"//pkg/tcpip/transport/udp",
"//pkg/waiter",
@@ -139,6 +142,7 @@ go_test(
"//pkg/tcpip/buffer",
"//pkg/tcpip/faketime",
"//pkg/tcpip/header",
+ "//pkg/tcpip/testutil",
"@com_github_google_go_cmp//cmp:go_default_library",
"@com_github_google_go_cmp//cmp/cmpopts:go_default_library",
],
diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go
index 3f083928f..41e964cf3 100644
--- a/pkg/tcpip/stack/conntrack.go
+++ b/pkg/tcpip/stack/conntrack.go
@@ -16,6 +16,7 @@ package stack
import (
"encoding/binary"
+ "fmt"
"sync"
"time"
@@ -29,7 +30,7 @@ import (
// The connection is created for a packet if it does not exist. Every
// connection contains two tuples (original and reply). The tuples are
// manipulated if there is a matching NAT rule. The packet is modified by
-// looking at the tuples in the Prerouting and Output hooks.
+// looking at the tuples in each hook.
//
// Currently, only TCP tracking is supported.
@@ -46,12 +47,14 @@ const (
)
// Manipulation type for the connection.
+// TODO(gvisor.dev/issue/5696): Define this as a bit set and support SNAT and
+// DNAT at the same time.
type manipType int
const (
manipNone manipType = iota
- manipDstPrerouting
- manipDstOutput
+ manipSource
+ manipDestination
)
// tuple holds a connection's identifying and manipulating data in one
@@ -108,6 +111,7 @@ type conn struct {
reply tuple
// manip indicates if the packet should be manipulated. It is immutable.
+ // TODO(gvisor.dev/issue/5696): Support updating manipulation type.
manip manipType
// tcbHook indicates if the packet is inbound or outbound to
@@ -124,6 +128,18 @@ type conn struct {
lastUsed time.Time `state:".(unixTime)"`
}
+// newConn creates new connection.
+func newConn(orig, reply tupleID, manip manipType, hook Hook) *conn {
+ conn := conn{
+ manip: manip,
+ tcbHook: hook,
+ lastUsed: time.Now(),
+ }
+ conn.original = tuple{conn: &conn, tupleID: orig}
+ conn.reply = tuple{conn: &conn, tupleID: reply, direction: dirReply}
+ return &conn
+}
+
// timedOut returns whether the connection timed out based on its state.
func (cn *conn) timedOut(now time.Time) bool {
const establishedTimeout = 5 * 24 * time.Hour
@@ -219,18 +235,6 @@ func packetToTupleID(pkt *PacketBuffer) (tupleID, tcpip.Error) {
}, nil
}
-// newConn creates new connection.
-func newConn(orig, reply tupleID, manip manipType, hook Hook) *conn {
- conn := conn{
- manip: manip,
- tcbHook: hook,
- lastUsed: time.Now(),
- }
- conn.original = tuple{conn: &conn, tupleID: orig}
- conn.reply = tuple{conn: &conn, tupleID: reply, direction: dirReply}
- return &conn
-}
-
func (ct *ConnTrack) init() {
ct.mu.Lock()
defer ct.mu.Unlock()
@@ -284,20 +288,41 @@ func (ct *ConnTrack) insertRedirectConn(pkt *PacketBuffer, hook Hook, port uint1
return nil
}
- // Create a new connection and change the port as per the iptables
- // rule. This tuple will be used to manipulate the packet in
- // handlePacket.
replyTID := tid.reply()
replyTID.srcAddr = address
replyTID.srcPort = port
- var manip manipType
- switch hook {
- case Prerouting:
- manip = manipDstPrerouting
- case Output:
- manip = manipDstOutput
+
+ conn, _ := ct.connForTID(tid)
+ if conn != nil {
+ // The connection is already tracked.
+ // TODO(gvisor.dev/issue/5696): Support updating an existing connection.
+ return nil
}
- conn := newConn(tid, replyTID, manip, hook)
+ conn = newConn(tid, replyTID, manipDestination, hook)
+ ct.insertConn(conn)
+ return conn
+}
+
+func (ct *ConnTrack) insertSNATConn(pkt *PacketBuffer, hook Hook, port uint16, address tcpip.Address) *conn {
+ tid, err := packetToTupleID(pkt)
+ if err != nil {
+ return nil
+ }
+ if hook != Input && hook != Postrouting {
+ return nil
+ }
+
+ replyTID := tid.reply()
+ replyTID.dstAddr = address
+ replyTID.dstPort = port
+
+ conn, _ := ct.connForTID(tid)
+ if conn != nil {
+ // The connection is already tracked.
+ // TODO(gvisor.dev/issue/5696): Support updating an existing connection.
+ return nil
+ }
+ conn = newConn(tid, replyTID, manipSource, hook)
ct.insertConn(conn)
return conn
}
@@ -322,6 +347,7 @@ func (ct *ConnTrack) insertConn(conn *conn) {
// Now that we hold the locks, ensure the tuple hasn't been inserted by
// another thread.
+ // TODO(gvisor.dev/issue/5773): Should check conn.reply.tupleID, too?
alreadyInserted := false
for other := ct.buckets[tupleBucket].tuples.Front(); other != nil; other = other.Next() {
if other.tupleID == conn.original.tupleID {
@@ -343,86 +369,6 @@ func (ct *ConnTrack) insertConn(conn *conn) {
}
}
-// handlePacketPrerouting manipulates ports for packets in Prerouting hook.
-// TODO(gvisor.dev/issue/170): Change address for Prerouting hook.
-func handlePacketPrerouting(pkt *PacketBuffer, conn *conn, dir direction) {
- // If this is a noop entry, don't do anything.
- if conn.manip == manipNone {
- return
- }
-
- netHeader := pkt.Network()
- tcpHeader := header.TCP(pkt.TransportHeader().View())
-
- // For prerouting redirection, packets going in the original direction
- // have their destinations modified and replies have their sources
- // modified.
- switch dir {
- case dirOriginal:
- port := conn.reply.srcPort
- tcpHeader.SetDestinationPort(port)
- netHeader.SetDestinationAddress(conn.reply.srcAddr)
- case dirReply:
- port := conn.original.dstPort
- tcpHeader.SetSourcePort(port)
- netHeader.SetSourceAddress(conn.original.dstAddr)
- }
-
- // TODO(gvisor.dev/issue/170): TCP checksums aren't usually validated
- // on inbound packets, so we don't recalculate them. However, we should
- // support cases when they are validated, e.g. when we can't offload
- // receive checksumming.
-
- // After modification, IPv4 packets need a valid checksum.
- if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber {
- netHeader := header.IPv4(pkt.NetworkHeader().View())
- netHeader.SetChecksum(0)
- netHeader.SetChecksum(^netHeader.CalculateChecksum())
- }
-}
-
-// handlePacketOutput manipulates ports for packets in Output hook.
-func handlePacketOutput(pkt *PacketBuffer, conn *conn, gso *GSO, r *Route, dir direction) {
- // If this is a noop entry, don't do anything.
- if conn.manip == manipNone {
- return
- }
-
- netHeader := pkt.Network()
- tcpHeader := header.TCP(pkt.TransportHeader().View())
-
- // For output redirection, packets going in the original direction
- // have their destinations modified and replies have their sources
- // modified. For prerouting redirection, we only reach this point
- // when replying, so packet sources are modified.
- if conn.manip == manipDstOutput && dir == dirOriginal {
- port := conn.reply.srcPort
- tcpHeader.SetDestinationPort(port)
- netHeader.SetDestinationAddress(conn.reply.srcAddr)
- } else {
- port := conn.original.dstPort
- tcpHeader.SetSourcePort(port)
- netHeader.SetSourceAddress(conn.original.dstAddr)
- }
-
- // Calculate the TCP checksum and set it.
- tcpHeader.SetChecksum(0)
- length := uint16(len(tcpHeader) + pkt.Data().Size())
- xsum := header.PseudoHeaderChecksum(header.TCPProtocolNumber, netHeader.SourceAddress(), netHeader.DestinationAddress(), length)
- if gso != nil && gso.NeedsCsum {
- tcpHeader.SetChecksum(xsum)
- } else if r.RequiresTXTransportChecksum() {
- xsum = header.ChecksumCombine(xsum, pkt.Data().AsRange().Checksum())
- tcpHeader.SetChecksum(^tcpHeader.CalculateChecksum(xsum))
- }
-
- if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber {
- netHeader := header.IPv4(pkt.NetworkHeader().View())
- netHeader.SetChecksum(0)
- netHeader.SetChecksum(^netHeader.CalculateChecksum())
- }
-}
-
// handlePacket will manipulate the port and address of the packet if the
// connection exists. Returns whether, after the packet traverses the tables,
// it should create a new entry in the table.
@@ -431,7 +377,9 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Rou
return false
}
- if hook != Prerouting && hook != Output {
+ switch hook {
+ case Prerouting, Input, Output, Postrouting:
+ default:
return false
}
@@ -441,23 +389,79 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Rou
}
conn, dir := ct.connFor(pkt)
- // Connection or Rule not found for the packet.
+ // Connection not found for the packet.
if conn == nil {
- return true
+ // If this is the last hook in the data path for this packet (Input if
+ // incoming, Postrouting if outgoing), indicate that a connection should be
+ // inserted by the end of this hook.
+ return hook == Input || hook == Postrouting
}
+ netHeader := pkt.Network()
tcpHeader := header.TCP(pkt.TransportHeader().View())
if len(tcpHeader) < header.TCPMinimumSize {
return false
}
+ // TODO(gvisor.dev/issue/5748): TCP checksums on inbound packets should be
+ // validated if checksum offloading is off. It may require IP defrag if the
+ // packets are fragmented.
+
+ switch hook {
+ case Prerouting, Output:
+ if conn.manip == manipDestination {
+ switch dir {
+ case dirOriginal:
+ tcpHeader.SetDestinationPort(conn.reply.srcPort)
+ netHeader.SetDestinationAddress(conn.reply.srcAddr)
+ case dirReply:
+ tcpHeader.SetSourcePort(conn.original.dstPort)
+ netHeader.SetSourceAddress(conn.original.dstAddr)
+ }
+ pkt.NatDone = true
+ }
+ case Input, Postrouting:
+ if conn.manip == manipSource {
+ switch dir {
+ case dirOriginal:
+ tcpHeader.SetSourcePort(conn.reply.dstPort)
+ netHeader.SetSourceAddress(conn.reply.dstAddr)
+ case dirReply:
+ tcpHeader.SetDestinationPort(conn.original.srcPort)
+ netHeader.SetDestinationAddress(conn.original.srcAddr)
+ }
+ pkt.NatDone = true
+ }
+ default:
+ panic(fmt.Sprintf("unrecognized hook = %s", hook))
+ }
+ if !pkt.NatDone {
+ return false
+ }
+
switch hook {
- case Prerouting:
- handlePacketPrerouting(pkt, conn, dir)
- case Output:
- handlePacketOutput(pkt, conn, gso, r, dir)
+ case Prerouting, Input:
+ case Output, Postrouting:
+ // Calculate the TCP checksum and set it.
+ tcpHeader.SetChecksum(0)
+ length := uint16(len(tcpHeader) + pkt.Data().Size())
+ xsum := header.PseudoHeaderChecksum(header.TCPProtocolNumber, netHeader.SourceAddress(), netHeader.DestinationAddress(), length)
+ if gso != nil && gso.NeedsCsum {
+ tcpHeader.SetChecksum(xsum)
+ } else if r.RequiresTXTransportChecksum() {
+ xsum = header.ChecksumCombine(xsum, pkt.Data().AsRange().Checksum())
+ tcpHeader.SetChecksum(^tcpHeader.CalculateChecksum(xsum))
+ }
+ default:
+ panic(fmt.Sprintf("unrecognized hook = %s", hook))
+ }
+
+ // After modification, IPv4 packets need a valid checksum.
+ if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber {
+ netHeader := header.IPv4(pkt.NetworkHeader().View())
+ netHeader.SetChecksum(0)
+ netHeader.SetChecksum(^netHeader.CalculateChecksum())
}
- pkt.NatDone = true
// Update the state of tcb.
// TODO(gvisor.dev/issue/170): Add support in tcpcontrack to handle
@@ -638,8 +642,8 @@ func (ct *ConnTrack) originalDst(epID TransportEndpointID, netProto tcpip.Networ
if conn == nil {
// Not a tracked connection.
return "", 0, &tcpip.ErrNotConnected{}
- } else if conn.manip == manipNone {
- // Unmanipulated connection.
+ } else if conn.manip != manipDestination {
+ // Unmanipulated destination.
return "", 0, &tcpip.ErrInvalidOptionValue{}
}
diff --git a/pkg/tcpip/stack/hook_string.go b/pkg/tcpip/stack/hook_string.go
new file mode 100644
index 000000000..3dc8a7b02
--- /dev/null
+++ b/pkg/tcpip/stack/hook_string.go
@@ -0,0 +1,41 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at //
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by "stringer -type Hook ."; DO NOT EDIT.
+
+package stack
+
+import "strconv"
+
+func _() {
+ // An "invalid array index" compiler error signifies that the constant values have changed.
+ // Re-run the stringer command to generate them again.
+ var x [1]struct{}
+ _ = x[Prerouting-0]
+ _ = x[Input-1]
+ _ = x[Forward-2]
+ _ = x[Output-3]
+ _ = x[Postrouting-4]
+ _ = x[NumHooks-5]
+}
+
+const _Hook_name = "PreroutingInputForwardOutputPostroutingNumHooks"
+
+var _Hook_index = [...]uint8{0, 10, 15, 22, 28, 39, 47}
+
+func (i Hook) String() string {
+ if i >= Hook(len(_Hook_index)-1) {
+ return "Hook(" + strconv.FormatInt(int64(i), 10) + ")"
+ }
+ return _Hook_name[_Hook_index[i]:_Hook_index[i+1]]
+}
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
index 52890f6eb..7ea87d325 100644
--- a/pkg/tcpip/stack/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -175,9 +175,10 @@ func DefaultTables() *IPTables {
},
},
priorities: [NumHooks][]TableID{
- Prerouting: {MangleID, NATID},
- Input: {NATID, FilterID},
- Output: {MangleID, NATID, FilterID},
+ Prerouting: {MangleID, NATID},
+ Input: {NATID, FilterID},
+ Output: {MangleID, NATID, FilterID},
+ Postrouting: {MangleID, NATID},
},
connections: ConnTrack{
seed: generateRandUint32(),
diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go
index 0e8b90c9b..317efe754 100644
--- a/pkg/tcpip/stack/iptables_targets.go
+++ b/pkg/tcpip/stack/iptables_targets.go
@@ -182,3 +182,81 @@ func (rt *RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, gs
return RuleAccept, 0
}
+
+// SNATTarget modifies the source port/IP in the outgoing packets.
+type SNATTarget struct {
+ Addr tcpip.Address
+ Port uint16
+
+ // NetworkProtocol is the network protocol the target is used with. It
+ // is immutable.
+ NetworkProtocol tcpip.NetworkProtocolNumber
+}
+
+// Action implements Target.Action.
+func (st *SNATTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int) {
+ // Sanity check.
+ if st.NetworkProtocol != pkt.NetworkProtocolNumber {
+ panic(fmt.Sprintf(
+ "SNATTarget.Action with NetworkProtocol %d called on packet with NetworkProtocolNumber %d",
+ st.NetworkProtocol, pkt.NetworkProtocolNumber))
+ }
+
+ // Packet is already manipulated.
+ if pkt.NatDone {
+ return RuleAccept, 0
+ }
+
+ // Drop the packet if network and transport header are not set.
+ if pkt.NetworkHeader().View().IsEmpty() || pkt.TransportHeader().View().IsEmpty() {
+ return RuleDrop, 0
+ }
+
+ switch hook {
+ case Postrouting, Input:
+ case Prerouting, Output, Forward:
+ panic(fmt.Sprintf("%s not supported", hook))
+ default:
+ panic(fmt.Sprintf("%s unrecognized", hook))
+ }
+
+ switch protocol := pkt.TransportProtocolNumber; protocol {
+ case header.UDPProtocolNumber:
+ udpHeader := header.UDP(pkt.TransportHeader().View())
+ udpHeader.SetChecksum(0)
+ udpHeader.SetSourcePort(st.Port)
+ netHeader := pkt.Network()
+ netHeader.SetSourceAddress(st.Addr)
+
+ // Only calculate the checksum if offloading isn't supported.
+ if r.RequiresTXTransportChecksum() {
+ length := uint16(pkt.Size()) - uint16(len(pkt.NetworkHeader().View()))
+ xsum := header.PseudoHeaderChecksum(protocol, netHeader.SourceAddress(), netHeader.DestinationAddress(), length)
+ xsum = header.ChecksumCombine(xsum, pkt.Data().AsRange().Checksum())
+ udpHeader.SetChecksum(^udpHeader.CalculateChecksum(xsum))
+ }
+
+ // After modification, IPv4 packets need a valid checksum.
+ if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber {
+ netHeader := header.IPv4(pkt.NetworkHeader().View())
+ netHeader.SetChecksum(0)
+ netHeader.SetChecksum(^netHeader.CalculateChecksum())
+ }
+ pkt.NatDone = true
+ case header.TCPProtocolNumber:
+ if ct == nil {
+ return RuleAccept, 0
+ }
+
+ // Set up conection for matching NAT rule. Only the first
+ // packet of the connection comes here. Other packets will be
+ // manipulated in connection tracking.
+ if conn := ct.insertSNATConn(pkt, hook, st.Port, st.Addr); conn != nil {
+ ct.handlePacket(pkt, hook, gso, r)
+ }
+ default:
+ return RuleDrop, 0
+ }
+
+ return RuleAccept, 0
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 14124ae66..b6cf24739 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -33,15 +33,19 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/tcpip/testutil"
"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
"gvisor.dev/gvisor/pkg/waiter"
)
+var (
+ addr1 = testutil.MustParse6("a00::1")
+ addr2 = testutil.MustParse6("a00::2")
+ addr3 = testutil.MustParse6("a00::3")
+)
+
const (
- addr1 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
- addr2 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
- addr3 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03")
linkAddr1 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
linkAddr2 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x07")
linkAddr3 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x08")
@@ -1390,7 +1394,7 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
// configured not to.
func TestNoPrefixDiscovery(t *testing.T) {
prefix := tcpip.AddressWithPrefix{
- Address: tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
+ Address: testutil.MustParse6("102:304:506:708::"),
PrefixLen: 64,
}
@@ -1590,7 +1594,7 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
}()
prefix := tcpip.AddressWithPrefix{
- Address: tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
+ Address: testutil.MustParse6("102:304:506:708::"),
PrefixLen: 64,
}
subnet := prefix.Subnet()
@@ -5204,13 +5208,13 @@ func TestRouterSolicitation(t *testing.T) {
}
// Make sure the right remote link address is used.
- if want := header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersMulticastAddress); p.Route.RemoteLinkAddress != want {
+ if want := header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersLinkLocalMulticastAddress); p.Route.RemoteLinkAddress != want {
t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want)
}
checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
checker.SrcAddr(test.expectedSrcAddr),
- checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+ checker.DstAddr(header.IPv6AllRoutersLinkLocalMulticastAddress),
checker.TTL(header.NDPHopLimit),
checker.NDPRS(checker.NDPRSOptions(test.expectedNDPOpts)),
)
@@ -5362,7 +5366,7 @@ func TestStopStartSolicitingRouters(t *testing.T) {
}
checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
checker.SrcAddr(header.IPv6Any),
- checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+ checker.DstAddr(header.IPv6AllRoutersLinkLocalMulticastAddress),
checker.TTL(header.NDPHopLimit),
checker.NDPRS())
}
diff --git a/pkg/tcpip/stack/neighbor_entry_test.go b/pkg/tcpip/stack/neighbor_entry_test.go
index bb2b2d705..1d39ee73d 100644
--- a/pkg/tcpip/stack/neighbor_entry_test.go
+++ b/pkg/tcpip/stack/neighbor_entry_test.go
@@ -26,14 +26,13 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/faketime"
"gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/testutil"
)
const (
entryTestNetNumber tcpip.NetworkProtocolNumber = math.MaxUint32
entryTestNICID tcpip.NICID = 1
- entryTestAddr1 = tcpip.Address("\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
- entryTestAddr2 = tcpip.Address("\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
entryTestLinkAddr1 = tcpip.LinkAddress("\x0a\x00\x00\x00\x00\x01")
entryTestLinkAddr2 = tcpip.LinkAddress("\x0a\x00\x00\x00\x00\x02")
@@ -44,6 +43,11 @@ const (
entryTestNetDefaultMTU = 65536
)
+var (
+ entryTestAddr1 = testutil.MustParse6("a::1")
+ entryTestAddr2 = testutil.MustParse6("a::2")
+)
+
// runImmediatelyScheduledJobs runs all jobs scheduled to run at the current
// time.
func runImmediatelyScheduledJobs(clock *faketime.ManualClock) {
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index 8f288675d..c10304d5f 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -299,9 +299,18 @@ func (pk *PacketBuffer) Network() header.Network {
// See PacketBuffer.Data for details about how a packet buffer holds an inbound
// packet.
func (pk *PacketBuffer) CloneToInbound() *PacketBuffer {
- return NewPacketBuffer(PacketBufferOptions{
+ newPk := NewPacketBuffer(PacketBufferOptions{
Data: buffer.NewVectorisedView(pk.Size(), pk.Views()),
})
+ // TODO(gvisor.dev/issue/5696): reimplement conntrack so that no need to
+ // maintain this flag in the packet. Currently conntrack needs this flag to
+ // tell if a noop connection should be inserted at Input hook. Once conntrack
+ // redefines the manipulation field as mutable, we won't need the special noop
+ // connection.
+ if pk.NatDone {
+ newPk.NatDone = true
+ }
+ return newPk
}
// headerInfo stores metadata about a header in a packet.
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 39344808d..4ae6bed5a 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -132,7 +132,7 @@ func constructAndValidateRoute(netProto tcpip.NetworkProtocolNumber, addressEndp
localAddr = addressEndpoint.AddressWithPrefix().Address
}
- if localAddressNIC != outgoingNIC && header.IsV6LinkLocalAddress(localAddr) {
+ if localAddressNIC != outgoingNIC && header.IsV6LinkLocalUnicastAddress(localAddr) {
addressEndpoint.DecRef()
return nil
}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 931a97ddc..21cfbad71 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -35,7 +35,6 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/ports"
- "gvisor.dev/gvisor/pkg/tcpip/seqnum"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -56,306 +55,6 @@ type transportProtocolState struct {
defaultHandler func(id TransportEndpointID, pkt *PacketBuffer) bool
}
-// TCPProbeFunc is the expected function type for a TCP probe function to be
-// passed to stack.AddTCPProbe.
-type TCPProbeFunc func(s TCPEndpointState)
-
-// TCPCubicState is used to hold a copy of the internal cubic state when the
-// TCPProbeFunc is invoked.
-type TCPCubicState struct {
- WLastMax float64
- WMax float64
- T time.Time
- TimeSinceLastCongestion time.Duration
- C float64
- K float64
- Beta float64
- WC float64
- WEst float64
-}
-
-// TCPRACKState is used to hold a copy of the internal RACK state when the
-// TCPProbeFunc is invoked.
-type TCPRACKState struct {
- XmitTime time.Time
- EndSequence seqnum.Value
- FACK seqnum.Value
- RTT time.Duration
- Reord bool
- DSACKSeen bool
- ReoWnd time.Duration
- ReoWndIncr uint8
- ReoWndPersist int8
- RTTSeq seqnum.Value
-}
-
-// TCPEndpointID is the unique 4 tuple that identifies a given endpoint.
-type TCPEndpointID struct {
- // LocalPort is the local port associated with the endpoint.
- LocalPort uint16
-
- // LocalAddress is the local [network layer] address associated with
- // the endpoint.
- LocalAddress tcpip.Address
-
- // RemotePort is the remote port associated with the endpoint.
- RemotePort uint16
-
- // RemoteAddress it the remote [network layer] address associated with
- // the endpoint.
- RemoteAddress tcpip.Address
-}
-
-// TCPFastRecoveryState holds a copy of the internal fast recovery state of a
-// TCP endpoint.
-type TCPFastRecoveryState struct {
- // Active if true indicates the endpoint is in fast recovery.
- Active bool
-
- // First is the first unacknowledged sequence number being recovered.
- First seqnum.Value
-
- // Last is the 'recover' sequence number that indicates the point at
- // which we should exit recovery barring any timeouts etc.
- Last seqnum.Value
-
- // MaxCwnd is the maximum value we are permitted to grow the congestion
- // window during recovery. This is set at the time we enter recovery.
- MaxCwnd int
-
- // HighRxt is the highest sequence number which has been retransmitted
- // during the current loss recovery phase.
- // See: RFC 6675 Section 2 for details.
- HighRxt seqnum.Value
-
- // RescueRxt is the highest sequence number which has been
- // optimistically retransmitted to prevent stalling of the ACK clock
- // when there is loss at the end of the window and no new data is
- // available for transmission.
- // See: RFC 6675 Section 2 for details.
- RescueRxt seqnum.Value
-}
-
-// TCPReceiverState holds a copy of the internal state of the receiver for
-// a given TCP endpoint.
-type TCPReceiverState struct {
- // RcvNxt is the TCP variable RCV.NXT.
- RcvNxt seqnum.Value
-
- // RcvAcc is the TCP variable RCV.ACC.
- RcvAcc seqnum.Value
-
- // RcvWndScale is the window scaling to use for inbound segments.
- RcvWndScale uint8
-
- // PendingBufUsed is the number of bytes pending in the receive
- // queue.
- PendingBufUsed int
-}
-
-// TCPSenderState holds a copy of the internal state of the sender for
-// a given TCP Endpoint.
-type TCPSenderState struct {
- // LastSendTime is the time at which we sent the last segment.
- LastSendTime time.Time
-
- // DupAckCount is the number of Duplicate ACK's received.
- DupAckCount int
-
- // SndCwnd is the size of the sending congestion window in packets.
- SndCwnd int
-
- // Ssthresh is the slow start threshold in packets.
- Ssthresh int
-
- // SndCAAckCount is the number of packets consumed in congestion
- // avoidance mode.
- SndCAAckCount int
-
- // Outstanding is the number of packets in flight.
- Outstanding int
-
- // SackedOut is the number of packets which have been selectively acked.
- SackedOut int
-
- // SndWnd is the send window size in bytes.
- SndWnd seqnum.Size
-
- // SndUna is the next unacknowledged sequence number.
- SndUna seqnum.Value
-
- // SndNxt is the sequence number of the next segment to be sent.
- SndNxt seqnum.Value
-
- // RTTMeasureSeqNum is the sequence number being used for the latest RTT
- // measurement.
- RTTMeasureSeqNum seqnum.Value
-
- // RTTMeasureTime is the time when the RTTMeasureSeqNum was sent.
- RTTMeasureTime time.Time
-
- // Closed indicates that the caller has closed the endpoint for sending.
- Closed bool
-
- // SRTT is the smoothed round-trip time as defined in section 2 of
- // RFC 6298.
- SRTT time.Duration
-
- // RTO is the retransmit timeout as defined in section of 2 of RFC 6298.
- RTO time.Duration
-
- // RTTVar is the round-trip time variation as defined in section 2 of
- // RFC 6298.
- RTTVar time.Duration
-
- // SRTTInited if true indicates take a valid RTT measurement has been
- // completed.
- SRTTInited bool
-
- // MaxPayloadSize is the maximum size of the payload of a given segment.
- // It is initialized on demand.
- MaxPayloadSize int
-
- // SndWndScale is the number of bits to shift left when reading the send
- // window size from a segment.
- SndWndScale uint8
-
- // MaxSentAck is the highest acknowledgement number sent till now.
- MaxSentAck seqnum.Value
-
- // FastRecovery holds the fast recovery state for the endpoint.
- FastRecovery TCPFastRecoveryState
-
- // Cubic holds the state related to CUBIC congestion control.
- Cubic TCPCubicState
-
- // RACKState holds the state related to RACK loss detection algorithm.
- RACKState TCPRACKState
-}
-
-// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
-type TCPSACKInfo struct {
- // Blocks is the list of SACK Blocks that identify the out of order segments
- // held by a given TCP endpoint.
- Blocks []header.SACKBlock
-
- // ReceivedBlocks are the SACK blocks received by this endpoint
- // from the peer endpoint.
- ReceivedBlocks []header.SACKBlock
-
- // MaxSACKED is the highest sequence number that has been SACKED
- // by the peer.
- MaxSACKED seqnum.Value
-}
-
-// RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning.
-type RcvBufAutoTuneParams struct {
- // MeasureTime is the time at which the current measurement
- // was started.
- MeasureTime time.Time
-
- // CopiedBytes is the number of bytes copied to user space since
- // this measure began.
- CopiedBytes int
-
- // PrevCopiedBytes is the number of bytes copied to userspace in
- // the previous RTT period.
- PrevCopiedBytes int
-
- // RcvBufSize is the auto tuned receive buffer size.
- RcvBufSize int
-
- // RTT is the smoothed RTT as measured by observing the time between
- // when a byte is first acknowledged and the receipt of data that is at
- // least one window beyond the sequence number that was acknowledged.
- RTT time.Duration
-
- // RTTVar is the "round-trip time variation" as defined in section 2
- // of RFC6298.
- RTTVar time.Duration
-
- // RTTMeasureSeqNumber is the highest acceptable sequence number at the
- // time this RTT measurement period began.
- RTTMeasureSeqNumber seqnum.Value
-
- // RTTMeasureTime is the absolute time at which the current RTT
- // measurement period began.
- RTTMeasureTime time.Time
-
- // Disabled is true if an explicit receive buffer is set for the
- // endpoint.
- Disabled bool
-}
-
-// TCPEndpointState is a copy of the internal state of a TCP endpoint.
-type TCPEndpointState struct {
- // ID is a copy of the TransportEndpointID for the endpoint.
- ID TCPEndpointID
-
- // SegTime denotes the absolute time when this segment was received.
- SegTime time.Time
-
- // RcvBufSize is the size of the receive socket buffer for the endpoint.
- RcvBufSize int
-
- // RcvBufUsed is the amount of bytes actually held in the receive socket
- // buffer for the endpoint.
- RcvBufUsed int
-
- // RcvBufAutoTuneParams is used to hold state variables to compute
- // the auto tuned receive buffer size.
- RcvAutoParams RcvBufAutoTuneParams
-
- // RcvClosed if true, indicates the endpoint has been closed for reading.
- RcvClosed bool
-
- // SendTSOk is used to indicate when the TS Option has been negotiated.
- // When sendTSOk is true every non-RST segment should carry a TS as per
- // RFC7323#section-1.1.
- SendTSOk bool
-
- // RecentTS is the timestamp that should be sent in the TSEcr field of
- // the timestamp for future segments sent by the endpoint. This field is
- // updated if required when a new segment is received by this endpoint.
- RecentTS uint32
-
- // TSOffset is a randomized offset added to the value of the TSVal field
- // in the timestamp option.
- TSOffset uint32
-
- // SACKPermitted is set to true if the peer sends the TCPSACKPermitted
- // option in the SYN/SYN-ACK.
- SACKPermitted bool
-
- // SACK holds TCP SACK related information for this endpoint.
- SACK TCPSACKInfo
-
- // SndBufSize is the size of the socket send buffer.
- SndBufSize int
-
- // SndBufUsed is the number of bytes held in the socket send buffer.
- SndBufUsed int
-
- // SndClosed indicates that the endpoint has been closed for sends.
- SndClosed bool
-
- // SndBufInQueue is the number of bytes in the send queue.
- SndBufInQueue seqnum.Size
-
- // PacketTooBigCount is used to notify the main protocol routine how
- // many times a "packet too big" control packet is received.
- PacketTooBigCount int
-
- // SndMTU is the smallest MTU seen in the control packets received.
- SndMTU int
-
- // Receiver holds variables related to the TCP receiver for the endpoint.
- Receiver TCPReceiverState
-
- // Sender holds state related to the TCP Sender for the endpoint.
- Sender TCPSenderState
-}
-
// ResumableEndpoint is an endpoint that needs to be resumed after restore.
type ResumableEndpoint interface {
// Resume resumes an endpoint after restore. This can be used to restart
@@ -455,7 +154,7 @@ type Stack struct {
// receiveBufferSize holds the min/default/max receive buffer sizes for
// endpoints other than TCP.
- receiveBufferSize ReceiveBufferSizeOption
+ receiveBufferSize tcpip.ReceiveBufferSizeOption
// tcpInvalidRateLimit is the maximal rate for sending duplicate
// acknowledgements in response to incoming TCP packets that are for an existing
@@ -669,7 +368,7 @@ func New(opts Options) *Stack {
Default: DefaultBufferSize,
Max: DefaultMaxBufferSize,
},
- receiveBufferSize: ReceiveBufferSizeOption{
+ receiveBufferSize: tcpip.ReceiveBufferSizeOption{
Min: MinBufferSize,
Default: DefaultBufferSize,
Max: DefaultMaxBufferSize,
@@ -1344,7 +1043,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
s.mu.RLock()
defer s.mu.RUnlock()
- isLinkLocal := header.IsV6LinkLocalAddress(remoteAddr) || header.IsV6LinkLocalMulticastAddress(remoteAddr)
+ isLinkLocal := header.IsV6LinkLocalUnicastAddress(remoteAddr) || header.IsV6LinkLocalMulticastAddress(remoteAddr)
isLocalBroadcast := remoteAddr == header.IPv4Broadcast
isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)
isLoopback := header.IsV4LoopbackAddress(remoteAddr) || header.IsV6LoopbackAddress(remoteAddr)
@@ -1381,7 +1080,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
return nil, &tcpip.ErrNetworkUnreachable{}
}
- canForward := s.Forwarding(netProto) && !header.IsV6LinkLocalAddress(localAddr) && !isLinkLocal
+ canForward := s.Forwarding(netProto) && !header.IsV6LinkLocalUnicastAddress(localAddr) && !isLinkLocal
// Find a route to the remote with the route table.
var chosenRoute tcpip.Route
diff --git a/pkg/tcpip/stack/stack_global_state.go b/pkg/tcpip/stack/stack_global_state.go
index dfec4258a..33824afd0 100644
--- a/pkg/tcpip/stack/stack_global_state.go
+++ b/pkg/tcpip/stack/stack_global_state.go
@@ -14,6 +14,78 @@
package stack
+import "time"
+
// StackFromEnv is the global stack created in restore run.
// FIXME(b/36201077)
var StackFromEnv *Stack
+
+// saveT is invoked by stateify.
+func (t *TCPCubicState) saveT() unixTime {
+ return unixTime{t.T.Unix(), t.T.UnixNano()}
+}
+
+// loadT is invoked by stateify.
+func (t *TCPCubicState) loadT(unix unixTime) {
+ t.T = time.Unix(unix.second, unix.nano)
+}
+
+// saveXmitTime is invoked by stateify.
+func (t *TCPRACKState) saveXmitTime() unixTime {
+ return unixTime{t.XmitTime.Unix(), t.XmitTime.UnixNano()}
+}
+
+// loadXmitTime is invoked by stateify.
+func (t *TCPRACKState) loadXmitTime(unix unixTime) {
+ t.XmitTime = time.Unix(unix.second, unix.nano)
+}
+
+// saveLastSendTime is invoked by stateify.
+func (t *TCPSenderState) saveLastSendTime() unixTime {
+ return unixTime{t.LastSendTime.Unix(), t.LastSendTime.UnixNano()}
+}
+
+// loadLastSendTime is invoked by stateify.
+func (t *TCPSenderState) loadLastSendTime(unix unixTime) {
+ t.LastSendTime = time.Unix(unix.second, unix.nano)
+}
+
+// saveRTTMeasureTime is invoked by stateify.
+func (t *TCPSenderState) saveRTTMeasureTime() unixTime {
+ return unixTime{t.RTTMeasureTime.Unix(), t.RTTMeasureTime.UnixNano()}
+}
+
+// loadRTTMeasureTime is invoked by stateify.
+func (t *TCPSenderState) loadRTTMeasureTime(unix unixTime) {
+ t.RTTMeasureTime = time.Unix(unix.second, unix.nano)
+}
+
+// saveMeasureTime is invoked by stateify.
+func (r *RcvBufAutoTuneParams) saveMeasureTime() unixTime {
+ return unixTime{r.MeasureTime.Unix(), r.MeasureTime.UnixNano()}
+}
+
+// loadMeasureTime is invoked by stateify.
+func (r *RcvBufAutoTuneParams) loadMeasureTime(unix unixTime) {
+ r.MeasureTime = time.Unix(unix.second, unix.nano)
+}
+
+// saveRTTMeasureTime is invoked by stateify.
+func (r *RcvBufAutoTuneParams) saveRTTMeasureTime() unixTime {
+ return unixTime{r.RTTMeasureTime.Unix(), r.RTTMeasureTime.UnixNano()}
+}
+
+// loadRTTMeasureTime is invoked by stateify.
+func (r *RcvBufAutoTuneParams) loadRTTMeasureTime(unix unixTime) {
+ r.RTTMeasureTime = time.Unix(unix.second, unix.nano)
+}
+
+// saveSegTime is invoked by stateify.
+func (t *TCPEndpointState) saveSegTime() unixTime {
+ return unixTime{t.SegTime.Unix(), t.SegTime.UnixNano()}
+}
+
+// loadSegTime is invoked by stateify.
+func (t *TCPEndpointState) loadSegTime(unix unixTime) {
+ t.SegTime = time.Unix(unix.second, unix.nano)
+}
diff --git a/pkg/tcpip/stack/stack_options.go b/pkg/tcpip/stack/stack_options.go
index 3066f4ffd..80e8e0089 100644
--- a/pkg/tcpip/stack/stack_options.go
+++ b/pkg/tcpip/stack/stack_options.go
@@ -68,7 +68,7 @@ func (s *Stack) SetOption(option interface{}) tcpip.Error {
s.mu.Unlock()
return nil
- case ReceiveBufferSizeOption:
+ case tcpip.ReceiveBufferSizeOption:
// Make sure we don't allow lowering the buffer below minimum
// required for stack to work.
if v.Min < MinBufferSize {
@@ -107,7 +107,7 @@ func (s *Stack) Option(option interface{}) tcpip.Error {
s.mu.RUnlock()
return nil
- case *ReceiveBufferSizeOption:
+ case *tcpip.ReceiveBufferSizeOption:
s.mu.RLock()
*v = s.receiveBufferSize
s.mu.RUnlock()
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 2814b94b4..a0bd69d9a 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -39,6 +39,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/tcpip/testutil"
"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
)
@@ -1645,10 +1646,10 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) {
defaultAddr := tcpip.AddressWithPrefix{header.IPv4Any, 0}
// Local subnet on NIC1: 192.168.1.58/24, gateway 192.168.1.1.
nic1Addr := tcpip.AddressWithPrefix{"\xc0\xa8\x01\x3a", 24}
- nic1Gateway := tcpip.Address("\xc0\xa8\x01\x01")
+ nic1Gateway := testutil.MustParse4("192.168.1.1")
// Local subnet on NIC2: 10.10.10.5/24, gateway 10.10.10.1.
nic2Addr := tcpip.AddressWithPrefix{"\x0a\x0a\x0a\x05", 24}
- nic2Gateway := tcpip.Address("\x0a\x0a\x0a\x01")
+ nic2Gateway := testutil.MustParse4("10.10.10.1")
// Create a new stack with two NICs.
s := stack.New(stack.Options{
@@ -2789,25 +2790,27 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
const (
- linkLocalAddr1 = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
- linkLocalAddr2 = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
- linkLocalMulticastAddr = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
- uniqueLocalAddr1 = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
- uniqueLocalAddr2 = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
- globalAddr1 = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
- globalAddr2 = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
- globalAddr3 = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03")
- ipv4MappedIPv6Addr1 = tcpip.Address("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x01")
- ipv4MappedIPv6Addr2 = tcpip.Address("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x02")
- toredoAddr1 = tcpip.Address("\x20\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
- toredoAddr2 = tcpip.Address("\x20\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
- ipv6ToIPv4Addr1 = tcpip.Address("\x20\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
- ipv6ToIPv4Addr2 = tcpip.Address("\x20\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
-
nicID = 1
lifetimeSeconds = 9999
)
+ var (
+ linkLocalAddr1 = testutil.MustParse6("fe80::1")
+ linkLocalAddr2 = testutil.MustParse6("fe80::2")
+ linkLocalMulticastAddr = testutil.MustParse6("ff02::1")
+ uniqueLocalAddr1 = testutil.MustParse6("fc00::1")
+ uniqueLocalAddr2 = testutil.MustParse6("fd00::2")
+ globalAddr1 = testutil.MustParse6("a000::1")
+ globalAddr2 = testutil.MustParse6("a000::2")
+ globalAddr3 = testutil.MustParse6("a000::3")
+ ipv4MappedIPv6Addr1 = testutil.MustParse6("::ffff:0.0.0.1")
+ ipv4MappedIPv6Addr2 = testutil.MustParse6("::ffff:0.0.0.2")
+ toredoAddr1 = testutil.MustParse6("2001::1")
+ toredoAddr2 = testutil.MustParse6("2001::2")
+ ipv6ToIPv4Addr1 = testutil.MustParse6("2002::1")
+ ipv6ToIPv4Addr2 = testutil.MustParse6("2002::2")
+ )
+
prefix1, _, stableGlobalAddr1 := prefixSubnetAddr(0, linkAddr1)
prefix2, _, stableGlobalAddr2 := prefixSubnetAddr(1, linkAddr1)
@@ -3354,21 +3357,21 @@ func TestStackReceiveBufferSizeOption(t *testing.T) {
const sMin = stack.MinBufferSize
testCases := []struct {
name string
- rs stack.ReceiveBufferSizeOption
+ rs tcpip.ReceiveBufferSizeOption
err tcpip.Error
}{
// Invalid configurations.
- {"min_below_zero", stack.ReceiveBufferSizeOption{Min: -1, Default: sMin, Max: sMin}, &tcpip.ErrInvalidOptionValue{}},
- {"min_zero", stack.ReceiveBufferSizeOption{Min: 0, Default: sMin, Max: sMin}, &tcpip.ErrInvalidOptionValue{}},
- {"default_below_min", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin - 1, Max: sMin - 1}, &tcpip.ErrInvalidOptionValue{}},
- {"default_above_max", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin}, &tcpip.ErrInvalidOptionValue{}},
- {"max_below_min", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin - 1}, &tcpip.ErrInvalidOptionValue{}},
+ {"min_below_zero", tcpip.ReceiveBufferSizeOption{Min: -1, Default: sMin, Max: sMin}, &tcpip.ErrInvalidOptionValue{}},
+ {"min_zero", tcpip.ReceiveBufferSizeOption{Min: 0, Default: sMin, Max: sMin}, &tcpip.ErrInvalidOptionValue{}},
+ {"default_below_min", tcpip.ReceiveBufferSizeOption{Min: sMin, Default: sMin - 1, Max: sMin - 1}, &tcpip.ErrInvalidOptionValue{}},
+ {"default_above_max", tcpip.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin}, &tcpip.ErrInvalidOptionValue{}},
+ {"max_below_min", tcpip.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin - 1}, &tcpip.ErrInvalidOptionValue{}},
// Valid Configurations
- {"in_ascending_order", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 2}, nil},
- {"all_equal", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin, Max: sMin}, nil},
- {"min_default_equal", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin, Max: sMin + 1}, nil},
- {"default_max_equal", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 1}, nil},
+ {"in_ascending_order", tcpip.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 2}, nil},
+ {"all_equal", tcpip.ReceiveBufferSizeOption{Min: sMin, Default: sMin, Max: sMin}, nil},
+ {"min_default_equal", tcpip.ReceiveBufferSizeOption{Min: sMin, Default: sMin, Max: sMin + 1}, nil},
+ {"default_max_equal", tcpip.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 1}, nil},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
@@ -3377,7 +3380,7 @@ func TestStackReceiveBufferSizeOption(t *testing.T) {
if err := s.SetOption(tc.rs); err != tc.err {
t.Fatalf("s.SetOption(%#v) = %v, want: %v", tc.rs, err, tc.err)
}
- var rs stack.ReceiveBufferSizeOption
+ var rs tcpip.ReceiveBufferSizeOption
if tc.err == nil {
if err := s.Option(&rs); err != nil {
t.Fatalf("s.Option(%#v) = %v, want: nil", rs, err)
@@ -3448,7 +3451,7 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
}
ipv4Subnet := ipv4Addr.Subnet()
ipv4SubnetBcast := ipv4Subnet.Broadcast()
- ipv4Gateway := tcpip.Address("\xc0\xa8\x01\x01")
+ ipv4Gateway := testutil.MustParse4("192.168.1.1")
ipv4AddrPrefix31 := tcpip.AddressWithPrefix{
Address: "\xc0\xa8\x01\x3a",
PrefixLen: 31,
@@ -4352,13 +4355,15 @@ func TestWritePacketToRemote(t *testing.T) {
func TestClearNeighborCacheOnNICDisable(t *testing.T) {
const (
- nicID = 1
-
- ipv4Addr = tcpip.Address("\x01\x02\x03\x04")
- ipv6Addr = tcpip.Address("\x01\x02\x03\x04\x01\x02\x03\x04\x01\x02\x03\x04\x01\x02\x03\x04")
+ nicID = 1
linkAddr = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
)
+ var (
+ ipv4Addr = testutil.MustParse4("1.2.3.4")
+ ipv6Addr = testutil.MustParse6("102:304:102:304:102:304:102:304")
+ )
+
clock := faketime.NewManualClock()
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocolFactory{arp.NewProtocol, ipv4.NewProtocol, ipv6.NewProtocol},
diff --git a/pkg/tcpip/stack/tcp.go b/pkg/tcpip/stack/tcp.go
new file mode 100644
index 000000000..ddff6e2d6
--- /dev/null
+++ b/pkg/tcpip/stack/tcp.go
@@ -0,0 +1,451 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+ "time"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/seqnum"
+)
+
+// TCPProbeFunc is the expected function type for a TCP probe function to be
+// passed to stack.AddTCPProbe.
+type TCPProbeFunc func(s TCPEndpointState)
+
+// TCPCubicState is used to hold a copy of the internal cubic state when the
+// TCPProbeFunc is invoked.
+//
+// +stateify savable
+type TCPCubicState struct {
+ // WLastMax is the previous wMax value.
+ WLastMax float64
+
+ // WMax is the value of the congestion window at the time of the last
+ // congestion event.
+ WMax float64
+
+ // T is the time when the current congestion avoidance was entered.
+ T time.Time `state:".(unixTime)"`
+
+ // TimeSinceLastCongestion denotes the time since the current
+ // congestion avoidance was entered.
+ TimeSinceLastCongestion time.Duration
+
+ // C is the cubic constant as specified in RFC8312, page 11.
+ C float64
+
+ // K is the time period (in seconds) that the above function takes to
+ // increase the current window size to WMax if there are no further
+ // congestion events and is calculated using the following equation:
+ //
+ // K = cubic_root(WMax*(1-beta_cubic)/C) (Eq. 2, page 5)
+ K float64
+
+ // Beta is the CUBIC multiplication decrease factor. That is, when a
+ // congestion event is detected, CUBIC reduces its cwnd to
+ // WC(0)=WMax*beta_cubic.
+ Beta float64
+
+ // WC is window computed by CUBIC at time TimeSinceLastCongestion. It's
+ // calculated using the formula:
+ //
+ // WC(TimeSinceLastCongestion) = C*(t-K)^3 + WMax (Eq. 1)
+ WC float64
+
+ // WEst is the window computed by CUBIC at time
+ // TimeSinceLastCongestion+RTT i.e WC(TimeSinceLastCongestion+RTT).
+ WEst float64
+}
+
+// TCPRACKState is used to hold a copy of the internal RACK state when the
+// TCPProbeFunc is invoked.
+//
+// +stateify savable
+type TCPRACKState struct {
+ // XmitTime is the transmission timestamp of the most recent
+ // acknowledged segment.
+ XmitTime time.Time `state:".(unixTime)"`
+
+ // EndSequence is the ending TCP sequence number of the most recent
+ // acknowledged segment.
+ EndSequence seqnum.Value
+
+ // FACK is the highest selectively or cumulatively acknowledged
+ // sequence.
+ FACK seqnum.Value
+
+ // RTT is the round trip time of the most recently delivered packet on
+ // the connection (either cumulatively acknowledged or selectively
+ // acknowledged) that was not marked invalid as a possible spurious
+ // retransmission.
+ RTT time.Duration
+
+ // Reord is true iff reordering has been detected on this connection.
+ Reord bool
+
+ // DSACKSeen is true iff the connection has seen a DSACK.
+ DSACKSeen bool
+
+ // ReoWnd is the reordering window time used for recording packet
+ // transmission times. It is used to defer the moment at which RACK
+ // marks a packet lost.
+ ReoWnd time.Duration
+
+ // ReoWndIncr is the multiplier applied to adjust reorder window.
+ ReoWndIncr uint8
+
+ // ReoWndPersist is the number of loss recoveries before resetting
+ // reorder window.
+ ReoWndPersist int8
+
+ // RTTSeq is the SND.NXT when RTT is updated.
+ RTTSeq seqnum.Value
+}
+
+// TCPEndpointID is the unique 4 tuple that identifies a given endpoint.
+//
+// +stateify savable
+type TCPEndpointID struct {
+ // LocalPort is the local port associated with the endpoint.
+ LocalPort uint16
+
+ // LocalAddress is the local [network layer] address associated with
+ // the endpoint.
+ LocalAddress tcpip.Address
+
+ // RemotePort is the remote port associated with the endpoint.
+ RemotePort uint16
+
+ // RemoteAddress it the remote [network layer] address associated with
+ // the endpoint.
+ RemoteAddress tcpip.Address
+}
+
+// TCPFastRecoveryState holds a copy of the internal fast recovery state of a
+// TCP endpoint.
+//
+// +stateify savable
+type TCPFastRecoveryState struct {
+ // Active if true indicates the endpoint is in fast recovery. The
+ // following fields are only meaningful when Active is true.
+ Active bool
+
+ // First is the first unacknowledged sequence number being recovered.
+ First seqnum.Value
+
+ // Last is the 'recover' sequence number that indicates the point at
+ // which we should exit recovery barring any timeouts etc.
+ Last seqnum.Value
+
+ // MaxCwnd is the maximum value we are permitted to grow the congestion
+ // window during recovery. This is set at the time we enter recovery.
+ // It exists to avoid attacks where the receiver intentionally sends
+ // duplicate acks to artificially inflate the sender's cwnd.
+ MaxCwnd int
+
+ // HighRxt is the highest sequence number which has been retransmitted
+ // during the current loss recovery phase. See: RFC 6675 Section 2 for
+ // details.
+ HighRxt seqnum.Value
+
+ // RescueRxt is the highest sequence number which has been
+ // optimistically retransmitted to prevent stalling of the ACK clock
+ // when there is loss at the end of the window and no new data is
+ // available for transmission. See: RFC 6675 Section 2 for details.
+ RescueRxt seqnum.Value
+}
+
+// TCPReceiverState holds a copy of the internal state of the receiver for a
+// given TCP endpoint.
+//
+// +stateify savable
+type TCPReceiverState struct {
+ // RcvNxt is the TCP variable RCV.NXT.
+ RcvNxt seqnum.Value
+
+ // RcvAcc is one beyond the last acceptable sequence number. That is,
+ // the "largest" sequence value that the receiver has announced to its
+ // peer that it's willing to accept. This may be different than RcvNxt
+ // + (last advertised receive window) if the receive window is reduced;
+ // in that case we have to reduce the window as we receive more data
+ // instead of shrinking it.
+ RcvAcc seqnum.Value
+
+ // RcvWndScale is the window scaling to use for inbound segments.
+ RcvWndScale uint8
+
+ // PendingBufUsed is the number of bytes pending in the receive queue.
+ PendingBufUsed int
+}
+
+// TCPRTTState holds a copy of information about the endpoint's round trip
+// time.
+//
+// +stateify savable
+type TCPRTTState struct {
+ // SRTT is the smoothed round trip time defined in section 2 of RFC
+ // 6298.
+ SRTT time.Duration
+
+ // RTTVar is the round-trip time variation as defined in section 2 of
+ // RFC 6298.
+ RTTVar time.Duration
+
+ // SRTTInited if true indicates that a valid RTT measurement has been
+ // completed.
+ SRTTInited bool
+}
+
+// TCPSenderState holds a copy of the internal state of the sender for a given
+// TCP Endpoint.
+//
+// +stateify savable
+type TCPSenderState struct {
+ // LastSendTime is the timestamp at which we sent the last segment.
+ LastSendTime time.Time `state:".(unixTime)"`
+
+ // DupAckCount is the number of Duplicate ACKs received. It is used for
+ // fast retransmit.
+ DupAckCount int
+
+ // SndCwnd is the size of the sending congestion window in packets.
+ SndCwnd int
+
+ // Ssthresh is the threshold between slow start and congestion
+ // avoidance.
+ Ssthresh int
+
+ // SndCAAckCount is the number of packets acknowledged during
+ // congestion avoidance. When enough packets have been ack'd (typically
+ // cwnd packets), the congestion window is incremented by one.
+ SndCAAckCount int
+
+ // Outstanding is the number of packets that have been sent but not yet
+ // acknowledged.
+ Outstanding int
+
+ // SackedOut is the number of packets which have been selectively
+ // acked.
+ SackedOut int
+
+ // SndWnd is the send window size in bytes.
+ SndWnd seqnum.Size
+
+ // SndUna is the next unacknowledged sequence number.
+ SndUna seqnum.Value
+
+ // SndNxt is the sequence number of the next segment to be sent.
+ SndNxt seqnum.Value
+
+ // RTTMeasureSeqNum is the sequence number being used for the latest
+ // RTT measurement.
+ RTTMeasureSeqNum seqnum.Value
+
+ // RTTMeasureTime is the time when the RTTMeasureSeqNum was sent.
+ RTTMeasureTime time.Time `state:".(unixTime)"`
+
+ // Closed indicates that the caller has closed the endpoint for
+ // sending.
+ Closed bool
+
+ // RTO is the retransmit timeout as defined in section of 2 of RFC
+ // 6298.
+ RTO time.Duration
+
+ // RTTState holds information about the endpoint's round trip time.
+ RTTState TCPRTTState
+
+ // MaxPayloadSize is the maximum size of the payload of a given
+ // segment. It is initialized on demand.
+ MaxPayloadSize int
+
+ // SndWndScale is the number of bits to shift left when reading the
+ // send window size from a segment.
+ SndWndScale uint8
+
+ // MaxSentAck is the highest acknowledgement number sent till now.
+ MaxSentAck seqnum.Value
+
+ // FastRecovery holds the fast recovery state for the endpoint.
+ FastRecovery TCPFastRecoveryState
+
+ // Cubic holds the state related to CUBIC congestion control.
+ Cubic TCPCubicState
+
+ // RACKState holds the state related to RACK loss detection algorithm.
+ RACKState TCPRACKState
+}
+
+// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
+//
+// +stateify savable
+type TCPSACKInfo struct {
+ // Blocks is the list of SACK Blocks that identify the out of order
+ // segments held by a given TCP endpoint.
+ Blocks []header.SACKBlock
+
+ // ReceivedBlocks are the SACK blocks received by this endpoint from
+ // the peer endpoint.
+ ReceivedBlocks []header.SACKBlock
+
+ // MaxSACKED is the highest sequence number that has been SACKED by the
+ // peer.
+ MaxSACKED seqnum.Value
+}
+
+// RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning.
+//
+// +stateify savable
+type RcvBufAutoTuneParams struct {
+ // MeasureTime is the time at which the current measurement was
+ // started.
+ MeasureTime time.Time `state:".(unixTime)"`
+
+ // CopiedBytes is the number of bytes copied to user space since this
+ // measure began.
+ CopiedBytes int
+
+ // PrevCopiedBytes is the number of bytes copied to userspace in the
+ // previous RTT period.
+ PrevCopiedBytes int
+
+ // RcvBufSize is the auto tuned receive buffer size.
+ RcvBufSize int
+
+ // RTT is the smoothed RTT as measured by observing the time between
+ // when a byte is first acknowledged and the receipt of data that is at
+ // least one window beyond the sequence number that was acknowledged.
+ RTT time.Duration
+
+ // RTTVar is the "round-trip time variation" as defined in section 2 of
+ // RFC6298.
+ RTTVar time.Duration
+
+ // RTTMeasureSeqNumber is the highest acceptable sequence number at the
+ // time this RTT measurement period began.
+ RTTMeasureSeqNumber seqnum.Value
+
+ // RTTMeasureTime is the absolute time at which the current RTT
+ // measurement period began.
+ RTTMeasureTime time.Time `state:".(unixTime)"`
+
+ // Disabled is true if an explicit receive buffer is set for the
+ // endpoint.
+ Disabled bool
+}
+
+// TCPRcvBufState contains information about the state of an endpoint's receive
+// socket buffer.
+//
+// +stateify savable
+type TCPRcvBufState struct {
+ // RcvBufUsed is the amount of bytes actually held in the receive
+ // socket buffer for the endpoint.
+ RcvBufUsed int
+
+ // RcvBufAutoTuneParams is used to hold state variables to compute the
+ // auto tuned receive buffer size.
+ RcvAutoParams RcvBufAutoTuneParams
+
+ // RcvClosed if true, indicates the endpoint has been closed for
+ // reading.
+ RcvClosed bool
+}
+
+// TCPSndBufState contains information about the state of an endpoint's send
+// socket buffer.
+//
+// +stateify savable
+type TCPSndBufState struct {
+ // SndBufSize is the size of the socket send buffer.
+ SndBufSize int
+
+ // SndBufUsed is the number of bytes held in the socket send buffer.
+ SndBufUsed int
+
+ // SndClosed indicates that the endpoint has been closed for sends.
+ SndClosed bool
+
+ // SndBufInQueue is the number of bytes in the send queue.
+ SndBufInQueue seqnum.Size
+
+ // PacketTooBigCount is used to notify the main protocol routine how
+ // many times a "packet too big" control packet is received.
+ PacketTooBigCount int
+
+ // SndMTU is the smallest MTU seen in the control packets received.
+ SndMTU int
+}
+
+// TCPEndpointStateInner contains the members of TCPEndpointState used directly
+// (that is, not within another containing struct) within the endpoint's
+// internal implementation.
+//
+// +stateify savable
+type TCPEndpointStateInner struct {
+ // TSOffset is a randomized offset added to the value of the TSVal
+ // field in the timestamp option.
+ TSOffset uint32
+
+ // SACKPermitted is set to true if the peer sends the TCPSACKPermitted
+ // option in the SYN/SYN-ACK.
+ SACKPermitted bool
+
+ // SendTSOk is used to indicate when the TS Option has been negotiated.
+ // When sendTSOk is true every non-RST segment should carry a TS as per
+ // RFC7323#section-1.1.
+ SendTSOk bool
+
+ // RecentTS is the timestamp that should be sent in the TSEcr field of
+ // the timestamp for future segments sent by the endpoint. This field
+ // is updated if required when a new segment is received by this
+ // endpoint.
+ RecentTS uint32
+}
+
+// TCPEndpointState is a copy of the internal state of a TCP endpoint.
+//
+// +stateify savable
+type TCPEndpointState struct {
+ // TCPEndpointStateInner contains the members of TCPEndpointState used
+ // by the endpoint's internal implementation.
+ TCPEndpointStateInner
+
+ // ID is a copy of the TransportEndpointID for the endpoint.
+ ID TCPEndpointID
+
+ // SegTime denotes the absolute time when this segment was received.
+ SegTime time.Time `state:".(unixTime)"`
+
+ // RcvBufState contains information about the state of the endpoint's
+ // receive socket buffer.
+ RcvBufState TCPRcvBufState
+
+ // SndBufState contains information about the state of the endpoint's
+ // send socket buffer.
+ SndBufState TCPSndBufState
+
+ // SACK holds TCP SACK related information for this endpoint.
+ SACK TCPSACKInfo
+
+ // Receiver holds variables related to the TCP receiver for the
+ // endpoint.
+ Receiver TCPReceiverState
+
+ // Sender holds state related to the TCP Sender for the endpoint.
+ Sender TCPSenderState
+}
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index e188efccb..80ad1a9d4 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -150,16 +150,17 @@ func (epsByNIC *endpointsByNIC) transportEndpoints() []TransportEndpoint {
return eps
}
-// HandlePacket is called by the stack when new packets arrive to this transport
-// endpoint.
-func (epsByNIC *endpointsByNIC) handlePacket(id TransportEndpointID, pkt *PacketBuffer) {
+// handlePacket is called by the stack when new packets arrive to this transport
+// endpoint. It returns false if the packet could not be matched to any
+// transport endpoint, true otherwise.
+func (epsByNIC *endpointsByNIC) handlePacket(id TransportEndpointID, pkt *PacketBuffer) bool {
epsByNIC.mu.RLock()
mpep, ok := epsByNIC.endpoints[pkt.NICID]
if !ok {
if mpep, ok = epsByNIC.endpoints[0]; !ok {
epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
- return
+ return false
}
}
@@ -168,18 +169,19 @@ func (epsByNIC *endpointsByNIC) handlePacket(id TransportEndpointID, pkt *Packet
if isInboundMulticastOrBroadcast(pkt, id.LocalAddress) {
mpep.handlePacketAll(id, pkt)
epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
- return
+ return true
}
// multiPortEndpoints are guaranteed to have at least one element.
transEP := selectEndpoint(id, mpep, epsByNIC.seed)
if queuedProtocol, mustQueue := mpep.demux.queuedProtocols[protocolIDs{mpep.netProto, mpep.transProto}]; mustQueue {
queuedProtocol.QueuePacket(transEP, id, pkt)
epsByNIC.mu.RUnlock()
- return
+ return true
}
transEP.HandlePacket(id, pkt)
epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
+ return true
}
// handleError delivers an error to the transport endpoint identified by id.
@@ -567,8 +569,7 @@ func (d *transportDemuxer) deliverPacket(protocol tcpip.TransportProtocolNumber,
}
return false
}
- ep.handlePacket(id, pkt)
- return true
+ return ep.handlePacket(id, pkt)
}
// deliverRawPacket attempts to deliver the given packet and returns whether it
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 054cced0c..0adedd7c0 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -70,7 +70,7 @@ func (f *fakeTransportEndpoint) SocketOptions() *tcpip.SocketOptions {
func newFakeTransportEndpoint(proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, s *stack.Stack) tcpip.Endpoint {
ep := &fakeTransportEndpoint{TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: s.UniqueID()}
- ep.ops.InitHandler(ep, s, tcpip.GetStackSendBufferLimits)
+ ep.ops.InitHandler(ep, s, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits)
return ep
}
@@ -233,7 +233,7 @@ func (f *fakeTransportEndpoint) HandlePacket(id stack.TransportEndpointID, pkt *
peerAddr: route.RemoteAddress(),
route: route,
}
- ep.ops.InitHandler(ep, f.proto.stack, tcpip.GetStackSendBufferLimits)
+ ep.ops.InitHandler(ep, f.proto.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits)
f.acceptQueue = append(f.acceptQueue, ep)
}