summaryrefslogtreecommitdiffhomepage
path: root/pkg/tcpip
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/tcpip')
-rw-r--r--pkg/tcpip/buffer/prependable.go5
-rw-r--r--pkg/tcpip/checker/checker.go61
-rw-r--r--pkg/tcpip/header/BUILD19
-rw-r--r--pkg/tcpip/header/checksum.go50
-rw-r--r--pkg/tcpip/header/checksum_test.go109
-rw-r--r--pkg/tcpip/header/eth.go62
-rw-r--r--pkg/tcpip/header/eth_test.go68
-rw-r--r--pkg/tcpip/header/icmpv6.go32
-rw-r--r--pkg/tcpip/header/ipv6.go12
-rw-r--r--pkg/tcpip/header/ndp_neighbor_advert.go110
-rw-r--r--pkg/tcpip/header/ndp_neighbor_solicit.go52
-rw-r--r--pkg/tcpip/header/ndp_options.go172
-rw-r--r--pkg/tcpip/header/ndp_router_advert.go112
-rw-r--r--pkg/tcpip/header/ndp_test.go199
-rw-r--r--pkg/tcpip/link/channel/channel.go48
-rw-r--r--pkg/tcpip/link/fdbased/endpoint.go143
-rw-r--r--pkg/tcpip/link/fdbased/endpoint_test.go21
-rw-r--r--pkg/tcpip/link/fdbased/mmap.go5
-rw-r--r--pkg/tcpip/link/fdbased/packet_dispatchers.go22
-rw-r--r--pkg/tcpip/link/loopback/BUILD1
-rw-r--r--pkg/tcpip/link/loopback/loopback.go27
-rw-r--r--pkg/tcpip/link/muxed/injectable.go34
-rw-r--r--pkg/tcpip/link/muxed/injectable_test.go2
-rw-r--r--pkg/tcpip/link/rawfile/BUILD5
-rw-r--r--pkg/tcpip/link/rawfile/rawfile_unsafe.go11
-rw-r--r--pkg/tcpip/link/sharedmem/sharedmem.go22
-rw-r--r--pkg/tcpip/link/sharedmem/sharedmem_test.go16
-rw-r--r--pkg/tcpip/link/sniffer/sniffer.go85
-rw-r--r--pkg/tcpip/link/waitable/waitable.go28
-rw-r--r--pkg/tcpip/link/waitable/waitable_test.go19
-rw-r--r--pkg/tcpip/network/arp/arp.go9
-rw-r--r--pkg/tcpip/network/fragmentation/BUILD1
-rw-r--r--pkg/tcpip/network/fragmentation/fragmentation.go16
-rw-r--r--pkg/tcpip/network/fragmentation/fragmentation_test.go10
-rw-r--r--pkg/tcpip/network/fragmentation/reassembler.go10
-rw-r--r--pkg/tcpip/network/ip_test.go9
-rw-r--r--pkg/tcpip/network/ipv4/ipv4.go35
-rw-r--r--pkg/tcpip/network/ipv4/ipv4_test.go14
-rw-r--r--pkg/tcpip/network/ipv6/icmp.go132
-rw-r--r--pkg/tcpip/network/ipv6/icmp_test.go12
-rw-r--r--pkg/tcpip/network/ipv6/ipv6.go30
-rw-r--r--pkg/tcpip/network/ipv6/ndp_test.go4
-rw-r--r--pkg/tcpip/stack/BUILD5
-rw-r--r--pkg/tcpip/stack/ndp.go319
-rw-r--r--pkg/tcpip/stack/ndp_test.go443
-rw-r--r--pkg/tcpip/stack/nic.go309
-rw-r--r--pkg/tcpip/stack/registration.go83
-rw-r--r--pkg/tcpip/stack/route.go48
-rw-r--r--pkg/tcpip/stack/stack.go231
-rw-r--r--pkg/tcpip/stack/stack_test.go302
-rw-r--r--pkg/tcpip/stack/transport_demuxer.go2
-rw-r--r--pkg/tcpip/tcpip.go13
-rw-r--r--pkg/tcpip/transport/packet/BUILD46
-rw-r--r--pkg/tcpip/transport/packet/endpoint.go363
-rw-r--r--pkg/tcpip/transport/packet/endpoint_state.go72
-rw-r--r--pkg/tcpip/transport/raw/BUILD15
-rw-r--r--pkg/tcpip/transport/raw/endpoint.go30
-rw-r--r--pkg/tcpip/transport/raw/endpoint_state.go14
-rw-r--r--pkg/tcpip/transport/raw/protocol.go12
-rw-r--r--pkg/tcpip/transport/tcp/BUILD1
-rw-r--r--pkg/tcpip/transport/tcp/connect.go77
-rw-r--r--pkg/tcpip/transport/tcp/endpoint.go18
-rw-r--r--pkg/tcpip/transport/udp/endpoint.go2
-rw-r--r--pkg/tcpip/transport/udp/protocol.go11
64 files changed, 3947 insertions, 303 deletions
diff --git a/pkg/tcpip/buffer/prependable.go b/pkg/tcpip/buffer/prependable.go
index 4287464f3..48a2a2713 100644
--- a/pkg/tcpip/buffer/prependable.go
+++ b/pkg/tcpip/buffer/prependable.go
@@ -41,6 +41,11 @@ func NewPrependableFromView(v View) Prependable {
return Prependable{buf: v, usedIdx: 0}
}
+// NewEmptyPrependableFromView creates a new prependable buffer from a View.
+func NewEmptyPrependableFromView(v View) Prependable {
+ return Prependable{buf: v, usedIdx: len(v)}
+}
+
// View returns a View of the backing buffer that contains all prepended
// data so far.
func (p Prependable) View() View {
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 096ad71ab..02137e1c9 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -686,3 +686,64 @@ func ICMPv6Code(want byte) TransportChecker {
}
}
}
+
+// NDP creates a checker that checks that the packet contains a valid NDP
+// message for type of ty, with potentially additional checks specified by
+// checkers.
+//
+// checkers may assume that a valid ICMPv6 is passed to it containing a valid
+// NDP message as far as the size of the message (minSize) is concerned. The
+// values within the message are up to checkers to validate.
+func NDP(msgType header.ICMPv6Type, minSize int, checkers ...TransportChecker) NetworkChecker {
+ return func(t *testing.T, h []header.Network) {
+ t.Helper()
+
+ // Check normal ICMPv6 first.
+ ICMPv6(
+ ICMPv6Type(msgType),
+ ICMPv6Code(0))(t, h)
+
+ last := h[len(h)-1]
+
+ icmp := header.ICMPv6(last.Payload())
+ if got := len(icmp.NDPPayload()); got < minSize {
+ t.Fatalf("ICMPv6 NDP (type = %d) payload size of %d is less than the minimum size of %d", msgType, got, minSize)
+ }
+
+ for _, f := range checkers {
+ f(t, icmp)
+ }
+ if t.Failed() {
+ t.FailNow()
+ }
+ }
+}
+
+// NDPNS creates a checker that checks that the packet contains a valid NDP
+// Neighbor Solicitation message (as per the raw wire format), with potentially
+// additional checks specified by checkers.
+//
+// checkers may assume that a valid ICMPv6 is passed to it containing a valid
+// NDPNS message as far as the size of the messages concerned. The values within
+// the message are up to checkers to validate.
+func NDPNS(checkers ...TransportChecker) NetworkChecker {
+ return NDP(header.ICMPv6NeighborSolicit, header.NDPNSMinimumSize, checkers...)
+}
+
+// NDPNSTargetAddress creates a checker that checks the Target Address field of
+// a header.NDPNeighborSolicit.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid NDPNS message as far as the size is concerned.
+func NDPNSTargetAddress(want tcpip.Address) TransportChecker {
+ return func(t *testing.T, h header.Transport) {
+ t.Helper()
+
+ icmp := h.(header.ICMPv6)
+ ns := header.NDPNeighborSolicit(icmp.NDPPayload())
+
+ if got := ns.TargetAddress(); got != want {
+ t.Fatalf("got %T.TargetAddress = %s, want = %s", ns, got, want)
+ }
+ }
+}
diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index a255231a3..a3485b35c 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -16,6 +16,10 @@ go_library(
"ipv4.go",
"ipv6.go",
"ipv6_fragment.go",
+ "ndp_neighbor_advert.go",
+ "ndp_neighbor_solicit.go",
+ "ndp_options.go",
+ "ndp_router_advert.go",
"tcp.go",
"udp.go",
],
@@ -30,13 +34,26 @@ go_library(
)
go_test(
- name = "header_test",
+ name = "header_x_test",
size = "small",
srcs = [
+ "checksum_test.go",
"ipversion_test.go",
"tcp_test.go",
],
deps = [
":header",
+ "//pkg/tcpip/buffer",
+ ],
+)
+
+go_test(
+ name = "header_test",
+ size = "small",
+ srcs = [
+ "eth_test.go",
+ "ndp_test.go",
],
+ embed = [":header"],
+ deps = ["//pkg/tcpip"],
)
diff --git a/pkg/tcpip/header/checksum.go b/pkg/tcpip/header/checksum.go
index 39a4d69be..9749c7f4d 100644
--- a/pkg/tcpip/header/checksum.go
+++ b/pkg/tcpip/header/checksum.go
@@ -23,11 +23,17 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/buffer"
)
-func calculateChecksum(buf []byte, initial uint32) uint16 {
+func calculateChecksum(buf []byte, odd bool, initial uint32) (uint16, bool) {
v := initial
+ if odd {
+ v += uint32(buf[0])
+ buf = buf[1:]
+ }
+
l := len(buf)
- if l&1 != 0 {
+ odd = l&1 != 0
+ if odd {
l--
v += uint32(buf[l]) << 8
}
@@ -36,7 +42,7 @@ func calculateChecksum(buf []byte, initial uint32) uint16 {
v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
}
- return ChecksumCombine(uint16(v), uint16(v>>16))
+ return ChecksumCombine(uint16(v), uint16(v>>16)), odd
}
// Checksum calculates the checksum (as defined in RFC 1071) of the bytes in the
@@ -44,7 +50,8 @@ func calculateChecksum(buf []byte, initial uint32) uint16 {
//
// The initial checksum must have been computed on an even number of bytes.
func Checksum(buf []byte, initial uint16) uint16 {
- return calculateChecksum(buf, uint32(initial))
+ s, _ := calculateChecksum(buf, false, uint32(initial))
+ return s
}
// ChecksumVV calculates the checksum (as defined in RFC 1071) of the bytes in
@@ -52,19 +59,40 @@ func Checksum(buf []byte, initial uint16) uint16 {
//
// The initial checksum must have been computed on an even number of bytes.
func ChecksumVV(vv buffer.VectorisedView, initial uint16) uint16 {
- var odd bool
+ return ChecksumVVWithOffset(vv, initial, 0, vv.Size())
+}
+
+// ChecksumVVWithOffset calculates the checksum (as defined in RFC 1071) of the
+// bytes in the given VectorizedView.
+//
+// The initial checksum must have been computed on an even number of bytes.
+func ChecksumVVWithOffset(vv buffer.VectorisedView, initial uint16, off int, size int) uint16 {
+ odd := false
sum := initial
for _, v := range vv.Views() {
if len(v) == 0 {
continue
}
- s := uint32(sum)
- if odd {
- s += uint32(v[0])
- v = v[1:]
+
+ if off >= len(v) {
+ off -= len(v)
+ continue
+ }
+ v = v[off:]
+
+ l := len(v)
+ if l > size {
+ l = size
+ }
+ v = v[:l]
+
+ sum, odd = calculateChecksum(v, odd, uint32(sum))
+
+ size -= len(v)
+ if size == 0 {
+ break
}
- odd = len(v)&1 != 0
- sum = calculateChecksum(v, s)
+ off = 0
}
return sum
}
diff --git a/pkg/tcpip/header/checksum_test.go b/pkg/tcpip/header/checksum_test.go
new file mode 100644
index 000000000..86b466c1c
--- /dev/null
+++ b/pkg/tcpip/header/checksum_test.go
@@ -0,0 +1,109 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package header provides the implementation of the encoding and decoding of
+// network protocol headers.
+package header_test
+
+import (
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+func TestChecksumVVWithOffset(t *testing.T) {
+ testCases := []struct {
+ name string
+ vv buffer.VectorisedView
+ off, size int
+ initial uint16
+ want uint16
+ }{
+ {
+ name: "empty",
+ vv: buffer.NewVectorisedView(0, []buffer.View{
+ buffer.NewViewFromBytes([]byte{1, 9, 0, 5, 4}),
+ }),
+ off: 0,
+ size: 0,
+ want: 0,
+ },
+ {
+ name: "OneView",
+ vv: buffer.NewVectorisedView(0, []buffer.View{
+ buffer.NewViewFromBytes([]byte{1, 9, 0, 5, 4}),
+ }),
+ off: 0,
+ size: 5,
+ want: 1294,
+ },
+ {
+ name: "TwoViews",
+ vv: buffer.NewVectorisedView(0, []buffer.View{
+ buffer.NewViewFromBytes([]byte{1, 9, 0, 5, 4}),
+ buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123}),
+ }),
+ off: 0,
+ size: 11,
+ want: 33819,
+ },
+ {
+ name: "TwoViewsWithOffset",
+ vv: buffer.NewVectorisedView(0, []buffer.View{
+ buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}),
+ buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123}),
+ }),
+ off: 1,
+ size: 11,
+ want: 33819,
+ },
+ {
+ name: "ThreeViewsWithOffset",
+ vv: buffer.NewVectorisedView(0, []buffer.View{
+ buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}),
+ buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}),
+ buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123}),
+ }),
+ off: 7,
+ size: 11,
+ want: 33819,
+ },
+ {
+ name: "ThreeViewsWithInitial",
+ vv: buffer.NewVectorisedView(0, []buffer.View{
+ buffer.NewViewFromBytes([]byte{77, 11, 33, 0, 55, 44}),
+ buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}),
+ buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123, 99}),
+ }),
+ initial: 77,
+ off: 7,
+ size: 11,
+ want: 33896,
+ },
+ }
+ for _, tc := range testCases {
+ t.Run(tc.name, func(t *testing.T) {
+ if got, want := header.ChecksumVVWithOffset(tc.vv, tc.initial, tc.off, tc.size), tc.want; got != want {
+ t.Errorf("header.ChecksumVVWithOffset(%v) = %v, want: %v", tc, got, tc.want)
+ }
+ v := tc.vv.ToView()
+ v.TrimFront(tc.off)
+ v.CapLength(tc.size)
+ if got, want := header.Checksum(v, tc.initial), tc.want; got != want {
+ t.Errorf("header.Checksum(%v) = %v, want: %v", tc, got, tc.want)
+ }
+ })
+ }
+}
diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go
index 4c3d3311f..f5d2c127f 100644
--- a/pkg/tcpip/header/eth.go
+++ b/pkg/tcpip/header/eth.go
@@ -48,8 +48,48 @@ const (
// EthernetAddressSize is the size, in bytes, of an ethernet address.
EthernetAddressSize = 6
+
+ // unspecifiedEthernetAddress is the unspecified ethernet address
+ // (all bits set to 0).
+ unspecifiedEthernetAddress = tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00")
+
+ // unicastMulticastFlagMask is the mask of the least significant bit in
+ // the first octet (in network byte order) of an ethernet address that
+ // determines whether the ethernet address is a unicast or multicast. If
+ // the masked bit is a 1, then the address is a multicast, unicast
+ // otherwise.
+ //
+ // See the IEEE Std 802-2001 document for more details. Specifically,
+ // section 9.2.1 of http://ieee802.org/secmail/pdfocSP2xXA6d.pdf:
+ // "A 48-bit universal address consists of two parts. The first 24 bits
+ // correspond to the OUI as assigned by the IEEE, expect that the
+ // assignee may set the LSB of the first octet to 1 for group addresses
+ // or set it to 0 for individual addresses."
+ unicastMulticastFlagMask = 1
+
+ // unicastMulticastFlagByteIdx is the byte that holds the
+ // unicast/multicast flag. See unicastMulticastFlagMask.
+ unicastMulticastFlagByteIdx = 0
+)
+
+const (
+ // EthernetProtocolAll is a catch-all for all protocols carried inside
+ // an ethernet frame. It is mainly used to create packet sockets that
+ // capture all traffic.
+ EthernetProtocolAll tcpip.NetworkProtocolNumber = 0x0003
+
+ // EthernetProtocolPUP is the PARC Universial Packet protocol ethertype.
+ EthernetProtocolPUP tcpip.NetworkProtocolNumber = 0x0200
)
+// Ethertypes holds the protocol numbers describing the payload of an ethernet
+// frame. These types aren't necessarily supported by netstack, but can be used
+// to catch all traffic of a type via packet endpoints.
+var Ethertypes = []tcpip.NetworkProtocolNumber{
+ EthernetProtocolAll,
+ EthernetProtocolPUP,
+}
+
// SourceAddress returns the "MAC source" field of the ethernet frame header.
func (b Ethernet) SourceAddress() tcpip.LinkAddress {
return tcpip.LinkAddress(b[srcMAC:][:EthernetAddressSize])
@@ -72,3 +112,25 @@ func (b Ethernet) Encode(e *EthernetFields) {
copy(b[srcMAC:][:EthernetAddressSize], e.SrcAddr)
copy(b[dstMAC:][:EthernetAddressSize], e.DstAddr)
}
+
+// IsValidUnicastEthernetAddress returns true if addr is a valid unicast
+// ethernet address.
+func IsValidUnicastEthernetAddress(addr tcpip.LinkAddress) bool {
+ // Must be of the right length.
+ if len(addr) != EthernetAddressSize {
+ return false
+ }
+
+ // Must not be unspecified.
+ if addr == unspecifiedEthernetAddress {
+ return false
+ }
+
+ // Must not be a multicast.
+ if addr[unicastMulticastFlagByteIdx]&unicastMulticastFlagMask != 0 {
+ return false
+ }
+
+ // addr is a valid unicast ethernet address.
+ return true
+}
diff --git a/pkg/tcpip/header/eth_test.go b/pkg/tcpip/header/eth_test.go
new file mode 100644
index 000000000..6634c90f5
--- /dev/null
+++ b/pkg/tcpip/header/eth_test.go
@@ -0,0 +1,68 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+)
+
+func TestIsValidUnicastEthernetAddress(t *testing.T) {
+ tests := []struct {
+ name string
+ addr tcpip.LinkAddress
+ expected bool
+ }{
+ {
+ "Nil",
+ tcpip.LinkAddress([]byte(nil)),
+ false,
+ },
+ {
+ "Empty",
+ tcpip.LinkAddress(""),
+ false,
+ },
+ {
+ "InvalidLength",
+ tcpip.LinkAddress("\x01\x02\x03"),
+ false,
+ },
+ {
+ "Unspecified",
+ unspecifiedEthernetAddress,
+ false,
+ },
+ {
+ "Multicast",
+ tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+ false,
+ },
+ {
+ "Valid",
+ tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06"),
+ true,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ if got := IsValidUnicastEthernetAddress(test.addr); got != test.expected {
+ t.Fatalf("got IsValidUnicastEthernetAddress = %t, want = %t", got, test.expected)
+ }
+ })
+ }
+}
diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go
index 1125a7d14..c2bfd8c79 100644
--- a/pkg/tcpip/header/icmpv6.go
+++ b/pkg/tcpip/header/icmpv6.go
@@ -25,6 +25,12 @@ import (
type ICMPv6 []byte
const (
+ // ICMPv6HeaderSize is the size of the ICMPv6 header. That is, the
+ // sum of the size of the ICMPv6 Type, Code and Checksum fields, as
+ // per RFC 4443 section 2.1. After the ICMPv6 header, the ICMPv6
+ // message body begins.
+ ICMPv6HeaderSize = 4
+
// ICMPv6MinimumSize is the minimum size of a valid ICMP packet.
ICMPv6MinimumSize = 8
@@ -37,10 +43,16 @@ const (
// ICMPv6NeighborSolicitMinimumSize is the minimum size of a
// neighbor solicitation packet.
- ICMPv6NeighborSolicitMinimumSize = ICMPv6MinimumSize + 16
+ ICMPv6NeighborSolicitMinimumSize = ICMPv6HeaderSize + NDPNSMinimumSize
+
+ // ICMPv6NeighborAdvertMinimumSize is the minimum size of a
+ // neighbor advertisement packet.
+ ICMPv6NeighborAdvertMinimumSize = ICMPv6HeaderSize + NDPNAMinimumSize
- // ICMPv6NeighborAdvertSize is size of a neighbor advertisement.
- ICMPv6NeighborAdvertSize = 32
+ // ICMPv6NeighborAdvertSize is size of a neighbor advertisement
+ // including the NDP Target Link Layer option for an Ethernet
+ // address.
+ ICMPv6NeighborAdvertSize = ICMPv6HeaderSize + NDPNAMinimumSize + ndpTargetEthernetLinkLayerAddressSize
// ICMPv6EchoMinimumSize is the minimum size of a valid ICMP echo packet.
ICMPv6EchoMinimumSize = 8
@@ -68,6 +80,13 @@ const (
// icmpv6SequenceOffset is the offset of the sequence field
// in a ICMPv6 Echo Request/Reply message.
icmpv6SequenceOffset = 6
+
+ // NDPHopLimit is the expected IP hop limit value of 255 for received
+ // NDP packets, as per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1,
+ // 7.1.2 and 8.1. If the hop limit value is not 255, nodes MUST silently
+ // drop the NDP packet. All outgoing NDP packets must use this value for
+ // its IP hop limit field.
+ NDPHopLimit = 255
)
// ICMPv6Type is the ICMP type field described in RFC 4443 and friends.
@@ -166,6 +185,13 @@ func (b ICMPv6) SetSequence(sequence uint16) {
binary.BigEndian.PutUint16(b[icmpv6SequenceOffset:], sequence)
}
+// NDPPayload returns the NDP payload buffer. That is, it returns the ICMPv6
+// packet's message body as defined by RFC 4443 section 2.1; the portion of the
+// ICMPv6 buffer after the first ICMPv6HeaderSize bytes.
+func (b ICMPv6) NDPPayload() []byte {
+ return b[ICMPv6HeaderSize:]
+}
+
// Payload implements Transport.Payload.
func (b ICMPv6) Payload() []byte {
return b[ICMPv6PayloadOffset:]
diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index 9d3abc0e4..f1e60911b 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -87,7 +87,8 @@ const (
// section 5.
IPv6MinimumMTU = 1280
- // IPv6Any is the non-routable IPv6 "any" meta address.
+ // IPv6Any is the non-routable IPv6 "any" meta address. It is also
+ // known as the unspecified address.
IPv6Any tcpip.Address = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
)
@@ -100,6 +101,15 @@ var IPv6EmptySubnet = func() tcpip.Subnet {
return subnet
}()
+// IPv6LinkLocalPrefix is the prefix for IPv6 link-local addresses, as defined
+// by RFC 4291 section 2.5.6.
+//
+// The prefix is fe80::/64
+var IPv6LinkLocalPrefix = tcpip.AddressWithPrefix{
+ Address: "\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ PrefixLen: 64,
+}
+
// PayloadLength returns the value of the "payload length" field of the ipv6
// header.
func (b IPv6) PayloadLength() uint16 {
diff --git a/pkg/tcpip/header/ndp_neighbor_advert.go b/pkg/tcpip/header/ndp_neighbor_advert.go
new file mode 100644
index 000000000..505c92668
--- /dev/null
+++ b/pkg/tcpip/header/ndp_neighbor_advert.go
@@ -0,0 +1,110 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import "gvisor.dev/gvisor/pkg/tcpip"
+
+// NDPNeighborAdvert is an NDP Neighbor Advertisement message. It will
+// only contain the body of an ICMPv6 packet.
+//
+// See RFC 4861 section 4.4 for more details.
+type NDPNeighborAdvert []byte
+
+const (
+ // NDPNAMinimumSize is the minimum size of a valid NDP Neighbor
+ // Advertisement message (body of an ICMPv6 packet).
+ NDPNAMinimumSize = 20
+
+ // ndpNATargetAddressOffset is the start of the Target Address
+ // field within an NDPNeighborAdvert.
+ ndpNATargetAddressOffset = 4
+
+ // ndpNAOptionsOffset is the start of the NDP options in an
+ // NDPNeighborAdvert.
+ ndpNAOptionsOffset = ndpNATargetAddressOffset + IPv6AddressSize
+
+ // ndpNAFlagsOffset is the offset of the flags within an
+ // NDPNeighborAdvert
+ ndpNAFlagsOffset = 0
+
+ // ndpNARouterFlagMask is the mask of the Router Flag field in
+ // the flags byte within in an NDPNeighborAdvert.
+ ndpNARouterFlagMask = (1 << 7)
+
+ // ndpNASolicitedFlagMask is the mask of the Solicited Flag field in
+ // the flags byte within in an NDPNeighborAdvert.
+ ndpNASolicitedFlagMask = (1 << 6)
+
+ // ndpNAOverrideFlagMask is the mask of the Override Flag field in
+ // the flags byte within in an NDPNeighborAdvert.
+ ndpNAOverrideFlagMask = (1 << 5)
+)
+
+// TargetAddress returns the value within the Target Address field.
+func (b NDPNeighborAdvert) TargetAddress() tcpip.Address {
+ return tcpip.Address(b[ndpNATargetAddressOffset:][:IPv6AddressSize])
+}
+
+// SetTargetAddress sets the value within the Target Address field.
+func (b NDPNeighborAdvert) SetTargetAddress(addr tcpip.Address) {
+ copy(b[ndpNATargetAddressOffset:][:IPv6AddressSize], addr)
+}
+
+// RouterFlag returns the value of the Router Flag field.
+func (b NDPNeighborAdvert) RouterFlag() bool {
+ return b[ndpNAFlagsOffset]&ndpNARouterFlagMask != 0
+}
+
+// SetRouterFlag sets the value in the Router Flag field.
+func (b NDPNeighborAdvert) SetRouterFlag(f bool) {
+ if f {
+ b[ndpNAFlagsOffset] |= ndpNARouterFlagMask
+ } else {
+ b[ndpNAFlagsOffset] &^= ndpNARouterFlagMask
+ }
+}
+
+// SolicitedFlag returns the value of the Solicited Flag field.
+func (b NDPNeighborAdvert) SolicitedFlag() bool {
+ return b[ndpNAFlagsOffset]&ndpNASolicitedFlagMask != 0
+}
+
+// SetSolicitedFlag sets the value in the Solicited Flag field.
+func (b NDPNeighborAdvert) SetSolicitedFlag(f bool) {
+ if f {
+ b[ndpNAFlagsOffset] |= ndpNASolicitedFlagMask
+ } else {
+ b[ndpNAFlagsOffset] &^= ndpNASolicitedFlagMask
+ }
+}
+
+// OverrideFlag returns the value of the Override Flag field.
+func (b NDPNeighborAdvert) OverrideFlag() bool {
+ return b[ndpNAFlagsOffset]&ndpNAOverrideFlagMask != 0
+}
+
+// SetOverrideFlag sets the value in the Override Flag field.
+func (b NDPNeighborAdvert) SetOverrideFlag(f bool) {
+ if f {
+ b[ndpNAFlagsOffset] |= ndpNAOverrideFlagMask
+ } else {
+ b[ndpNAFlagsOffset] &^= ndpNAOverrideFlagMask
+ }
+}
+
+// Options returns an NDPOptions of the the options body.
+func (b NDPNeighborAdvert) Options() NDPOptions {
+ return NDPOptions(b[ndpNAOptionsOffset:])
+}
diff --git a/pkg/tcpip/header/ndp_neighbor_solicit.go b/pkg/tcpip/header/ndp_neighbor_solicit.go
new file mode 100644
index 000000000..3a1b8e139
--- /dev/null
+++ b/pkg/tcpip/header/ndp_neighbor_solicit.go
@@ -0,0 +1,52 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import "gvisor.dev/gvisor/pkg/tcpip"
+
+// NDPNeighborSolicit is an NDP Neighbor Solicitation message. It will only
+// contain the body of an ICMPv6 packet.
+//
+// See RFC 4861 section 4.3 for more details.
+type NDPNeighborSolicit []byte
+
+const (
+ // NDPNSMinimumSize is the minimum size of a valid NDP Neighbor
+ // Solicitation message (body of an ICMPv6 packet).
+ NDPNSMinimumSize = 20
+
+ // ndpNSTargetAddessOffset is the start of the Target Address
+ // field within an NDPNeighborSolicit.
+ ndpNSTargetAddessOffset = 4
+
+ // ndpNSOptionsOffset is the start of the NDP options in an
+ // NDPNeighborSolicit.
+ ndpNSOptionsOffset = ndpNSTargetAddessOffset + IPv6AddressSize
+)
+
+// TargetAddress returns the value within the Target Address field.
+func (b NDPNeighborSolicit) TargetAddress() tcpip.Address {
+ return tcpip.Address(b[ndpNSTargetAddessOffset:][:IPv6AddressSize])
+}
+
+// SetTargetAddress sets the value within the Target Address field.
+func (b NDPNeighborSolicit) SetTargetAddress(addr tcpip.Address) {
+ copy(b[ndpNSTargetAddessOffset:][:IPv6AddressSize], addr)
+}
+
+// Options returns an NDPOptions of the the options body.
+func (b NDPNeighborSolicit) Options() NDPOptions {
+ return NDPOptions(b[ndpNSOptionsOffset:])
+}
diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
new file mode 100644
index 000000000..b28bde15b
--- /dev/null
+++ b/pkg/tcpip/header/ndp_options.go
@@ -0,0 +1,172 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+ "gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+ // NDPTargetLinkLayerAddressOptionType is the type of the Target
+ // Link-Layer Address option, as per RFC 4861 section 4.6.1.
+ NDPTargetLinkLayerAddressOptionType = 2
+
+ // ndpTargetEthernetLinkLayerAddressSize is the size of a Target
+ // Link Layer Option for an Ethernet address.
+ ndpTargetEthernetLinkLayerAddressSize = 8
+
+ // lengthByteUnits is the multiplier factor for the Length field of an
+ // NDP option. That is, the length field for NDP options is in units of
+ // 8 octets, as per RFC 4861 section 4.6.
+ lengthByteUnits = 8
+)
+
+// NDPOptions is a buffer of NDP options as defined by RFC 4861 section 4.6.
+type NDPOptions []byte
+
+// Serialize serializes the provided list of NDP options into o.
+//
+// Note, b must be of sufficient size to hold all the options in s. See
+// NDPOptionsSerializer.Length for details on the getting the total size
+// of a serialized NDPOptionsSerializer.
+//
+// Serialize may panic if b is not of sufficient size to hold all the options
+// in s.
+func (b NDPOptions) Serialize(s NDPOptionsSerializer) int {
+ done := 0
+
+ for _, o := range s {
+ l := paddedLength(o)
+
+ if l == 0 {
+ continue
+ }
+
+ b[0] = o.Type()
+
+ // We know this safe because paddedLength would have returned
+ // 0 if o had an invalid length (> 255 * lengthByteUnits).
+ b[1] = uint8(l / lengthByteUnits)
+
+ // Serialize NDP option body.
+ used := o.serializeInto(b[2:])
+
+ // Zero out remaining (padding) bytes, if any exists.
+ for i := used + 2; i < l; i++ {
+ b[i] = 0
+ }
+
+ b = b[l:]
+ done += l
+ }
+
+ return done
+}
+
+// ndpOption is the set of functions to be implemented by all NDP option types.
+type ndpOption interface {
+ // Type returns the type of this ndpOption.
+ Type() uint8
+
+ // Length returns the length of the body of this ndpOption, in bytes.
+ Length() int
+
+ // serializeInto serializes this ndpOption into the provided byte
+ // buffer.
+ //
+ // Note, the caller MUST provide a byte buffer with size of at least
+ // Length. Implementers of this function may assume that the byte buffer
+ // is of sufficient size. serializeInto MAY panic if the provided byte
+ // buffer is not of sufficient size.
+ //
+ // serializeInto will return the number of bytes that was used to
+ // serialize this ndpOption. Implementers must only use the number of
+ // bytes required to serialize this ndpOption. Callers MAY provide a
+ // larger buffer than required to serialize into.
+ serializeInto([]byte) int
+}
+
+// paddedLength returns the length of o, in bytes, with any padding bytes, if
+// required.
+func paddedLength(o ndpOption) int {
+ l := o.Length()
+
+ if l == 0 {
+ return 0
+ }
+
+ // Length excludes the 2 Type and Length bytes.
+ l += 2
+
+ // Add extra bytes if needed to make sure the option is
+ // lengthByteUnits-byte aligned. We do this by adding lengthByteUnits-1
+ // to l and then stripping off the last few LSBits from l. This will
+ // make sure that l is rounded up to the nearest unit of
+ // lengthByteUnits. This works since lengthByteUnits is a power of 2
+ // (= 8).
+ mask := lengthByteUnits - 1
+ l += mask
+ l &^= mask
+
+ if l/lengthByteUnits > 255 {
+ // Should never happen because an option can only have a max
+ // value of 255 for its Length field, so just return 0 so this
+ // option does not get serialized.
+ //
+ // Returning 0 here will make sure that this option does not get
+ // serialized when NDPOptions.Serialize is called with the
+ // NDPOptionsSerializer that holds this option, effectively
+ // skipping this option during serialization. Also note that
+ // a value of zero for the Length field in an NDP option is
+ // invalid so this is another sign to the caller that this NDP
+ // option is malformed, as per RFC 4861 section 4.6.
+ return 0
+ }
+
+ return l
+}
+
+// NDPOptionsSerializer is a serializer for NDP options.
+type NDPOptionsSerializer []ndpOption
+
+// Length returns the total number of bytes required to serialize.
+func (b NDPOptionsSerializer) Length() int {
+ l := 0
+
+ for _, o := range b {
+ l += paddedLength(o)
+ }
+
+ return l
+}
+
+// NDPTargetLinkLayerAddressOption is the NDP Target Link Layer Option
+// as defined by RFC 4861 section 4.6.1.
+type NDPTargetLinkLayerAddressOption tcpip.LinkAddress
+
+// Type implements ndpOption.Type.
+func (o NDPTargetLinkLayerAddressOption) Type() uint8 {
+ return NDPTargetLinkLayerAddressOptionType
+}
+
+// Length implements ndpOption.Length.
+func (o NDPTargetLinkLayerAddressOption) Length() int {
+ return len(o)
+}
+
+// serializeInto implements ndpOption.serializeInto.
+func (o NDPTargetLinkLayerAddressOption) serializeInto(b []byte) int {
+ return copy(b, o)
+}
diff --git a/pkg/tcpip/header/ndp_router_advert.go b/pkg/tcpip/header/ndp_router_advert.go
new file mode 100644
index 000000000..bf7610863
--- /dev/null
+++ b/pkg/tcpip/header/ndp_router_advert.go
@@ -0,0 +1,112 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+ "encoding/binary"
+ "time"
+)
+
+// NDPRouterAdvert is an NDP Router Advertisement message. It will only contain
+// the body of an ICMPv6 packet.
+//
+// See RFC 4861 section 4.2 for more details.
+type NDPRouterAdvert []byte
+
+const (
+ // NDPRAMinimumSize is the minimum size of a valid NDP Router
+ // Advertisement message (body of an ICMPv6 packet).
+ NDPRAMinimumSize = 12
+
+ // ndpRACurrHopLimitOffset is the byte of the Curr Hop Limit field
+ // within an NDPRouterAdvert.
+ ndpRACurrHopLimitOffset = 0
+
+ // ndpRAFlagsOffset is the byte with the NDP RA bit-fields/flags
+ // within an NDPRouterAdvert.
+ ndpRAFlagsOffset = 1
+
+ // ndpRAManagedAddrConfFlagMask is the mask of the Managed Address
+ // Configuration flag within the bit-field/flags byte of an
+ // NDPRouterAdvert.
+ ndpRAManagedAddrConfFlagMask = (1 << 7)
+
+ // ndpRAOtherConfFlagMask is the mask of the Other Configuration flag
+ // within the bit-field/flags byte of an NDPRouterAdvert.
+ ndpRAOtherConfFlagMask = (1 << 6)
+
+ // ndpRARouterLifetimeOffset is the start of the 2-byte Router Lifetime
+ // field within an NDPRouterAdvert.
+ ndpRARouterLifetimeOffset = 2
+
+ // ndpRAReachableTimeOffset is the start of the 4-byte Reachable Time
+ // field within an NDPRouterAdvert.
+ ndpRAReachableTimeOffset = 4
+
+ // ndpRARetransTimerOffset is the start of the 4-byte Retrans Timer
+ // field within an NDPRouterAdvert.
+ ndpRARetransTimerOffset = 8
+
+ // ndpRAOptionsOffset is the start of the NDP options in an
+ // NDPRouterAdvert.
+ ndpRAOptionsOffset = 12
+)
+
+// CurrHopLimit returns the value of the Curr Hop Limit field.
+func (b NDPRouterAdvert) CurrHopLimit() uint8 {
+ return b[ndpRACurrHopLimitOffset]
+}
+
+// ManagedAddrConfFlag returns the value of the Managed Address Configuration
+// flag.
+func (b NDPRouterAdvert) ManagedAddrConfFlag() bool {
+ return b[ndpRAFlagsOffset]&ndpRAManagedAddrConfFlagMask != 0
+}
+
+// OtherConfFlag returns the value of the Other Configuration flag.
+func (b NDPRouterAdvert) OtherConfFlag() bool {
+ return b[ndpRAFlagsOffset]&ndpRAOtherConfFlagMask != 0
+}
+
+// RouterLifetime returns the lifetime associated with the default router. A
+// value of 0 means the source of the Router Advertisement is not a default
+// router and SHOULD NOT appear on the default router list. Note, a value of 0
+// only means that the router should not be used as a default router, it does
+// not apply to other information contained in the Router Advertisement.
+func (b NDPRouterAdvert) RouterLifetime() time.Duration {
+ // The field is the time in seconds, as per RFC 4861 section 4.2.
+ return time.Second * time.Duration(binary.BigEndian.Uint16(b[ndpRARouterLifetimeOffset:]))
+}
+
+// ReachableTime returns the time that a node assumes a neighbor is reachable
+// after having received a reachability confirmation. A value of 0 means
+// that it is unspecified by the source of the Router Advertisement message.
+func (b NDPRouterAdvert) ReachableTime() time.Duration {
+ // The field is the time in milliseconds, as per RFC 4861 section 4.2.
+ return time.Millisecond * time.Duration(binary.BigEndian.Uint32(b[ndpRAReachableTimeOffset:]))
+}
+
+// RetransTimer returns the time between retransmitted Neighbor Solicitation
+// messages. A value of 0 means that it is unspecified by the source of the
+// Router Advertisement message.
+func (b NDPRouterAdvert) RetransTimer() time.Duration {
+ // The field is the time in milliseconds, as per RFC 4861 section 4.2.
+ return time.Millisecond * time.Duration(binary.BigEndian.Uint32(b[ndpRARetransTimerOffset:]))
+}
+
+// Options returns an NDPOptions of the the options body.
+func (b NDPRouterAdvert) Options() NDPOptions {
+ return NDPOptions(b[ndpRAOptionsOffset:])
+}
diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go
new file mode 100644
index 000000000..0aac14f43
--- /dev/null
+++ b/pkg/tcpip/header/ndp_test.go
@@ -0,0 +1,199 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+ "bytes"
+ "testing"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+)
+
+// TestNDPNeighborSolicit tests the functions of NDPNeighborSolicit.
+func TestNDPNeighborSolicit(t *testing.T) {
+ b := []byte{
+ 0, 0, 0, 0,
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16,
+ }
+
+ // Test getting the Target Address.
+ ns := NDPNeighborSolicit(b)
+ addr := tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10")
+ if got := ns.TargetAddress(); got != addr {
+ t.Fatalf("got ns.TargetAddress = %s, want %s", got, addr)
+ }
+
+ // Test updating the Target Address.
+ addr2 := tcpip.Address("\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x11")
+ ns.SetTargetAddress(addr2)
+ if got := ns.TargetAddress(); got != addr2 {
+ t.Fatalf("got ns.TargetAddress = %s, want %s", got, addr2)
+ }
+ // Make sure the address got updated in the backing buffer.
+ if got := tcpip.Address(b[ndpNSTargetAddessOffset:][:IPv6AddressSize]); got != addr2 {
+ t.Fatalf("got targetaddress buffer = %s, want %s", got, addr2)
+ }
+}
+
+// TestNDPNeighborAdvert tests the functions of NDPNeighborAdvert.
+func TestNDPNeighborAdvert(t *testing.T) {
+ b := []byte{
+ 160, 0, 0, 0,
+ 1, 2, 3, 4,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 13, 14, 15, 16,
+ }
+
+ // Test getting the Target Address.
+ na := NDPNeighborAdvert(b)
+ addr := tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10")
+ if got := na.TargetAddress(); got != addr {
+ t.Fatalf("got TargetAddress = %s, want %s", got, addr)
+ }
+
+ // Test getting the Router Flag.
+ if got := na.RouterFlag(); !got {
+ t.Fatalf("got RouterFlag = false, want = true")
+ }
+
+ // Test getting the Solicited Flag.
+ if got := na.SolicitedFlag(); got {
+ t.Fatalf("got SolicitedFlag = true, want = false")
+ }
+
+ // Test getting the Override Flag.
+ if got := na.OverrideFlag(); !got {
+ t.Fatalf("got OverrideFlag = false, want = true")
+ }
+
+ // Test updating the Target Address.
+ addr2 := tcpip.Address("\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x11")
+ na.SetTargetAddress(addr2)
+ if got := na.TargetAddress(); got != addr2 {
+ t.Fatalf("got TargetAddress = %s, want %s", got, addr2)
+ }
+ // Make sure the address got updated in the backing buffer.
+ if got := tcpip.Address(b[ndpNATargetAddressOffset:][:IPv6AddressSize]); got != addr2 {
+ t.Fatalf("got targetaddress buffer = %s, want %s", got, addr2)
+ }
+
+ // Test updating the Router Flag.
+ na.SetRouterFlag(false)
+ if got := na.RouterFlag(); got {
+ t.Fatalf("got RouterFlag = true, want = false")
+ }
+
+ // Test updating the Solicited Flag.
+ na.SetSolicitedFlag(true)
+ if got := na.SolicitedFlag(); !got {
+ t.Fatalf("got SolicitedFlag = false, want = true")
+ }
+
+ // Test updating the Override Flag.
+ na.SetOverrideFlag(false)
+ if got := na.OverrideFlag(); got {
+ t.Fatalf("got OverrideFlag = true, want = false")
+ }
+
+ // Make sure flags got updated in the backing buffer.
+ if got := b[ndpNAFlagsOffset]; got != 64 {
+ t.Fatalf("got flags byte = %d, want = 64")
+ }
+}
+
+func TestNDPRouterAdvert(t *testing.T) {
+ b := []byte{
+ 64, 128, 1, 2,
+ 3, 4, 5, 6,
+ 7, 8, 9, 10,
+ }
+
+ ra := NDPRouterAdvert(b)
+
+ if got := ra.CurrHopLimit(); got != 64 {
+ t.Fatalf("got ra.CurrHopLimit = %d, want = 64", got)
+ }
+
+ if got := ra.ManagedAddrConfFlag(); !got {
+ t.Fatalf("got ManagedAddrConfFlag = false, want = true")
+ }
+
+ if got := ra.OtherConfFlag(); got {
+ t.Fatalf("got OtherConfFlag = true, want = false")
+ }
+
+ if got, want := ra.RouterLifetime(), time.Second*258; got != want {
+ t.Fatalf("got ra.RouterLifetime = %d, want = %d", got, want)
+ }
+
+ if got, want := ra.ReachableTime(), time.Millisecond*50595078; got != want {
+ t.Fatalf("got ra.ReachableTime = %d, want = %d", got, want)
+ }
+
+ if got, want := ra.RetransTimer(), time.Millisecond*117967114; got != want {
+ t.Fatalf("got ra.RetransTimer = %d, want = %d", got, want)
+ }
+}
+
+// TestNDPTargetLinkLayerAddressOptionSerialize tests serializing a
+// NDPTargetLinkLayerAddressOption.
+func TestNDPTargetLinkLayerAddressOptionSerialize(t *testing.T) {
+ tests := []struct {
+ name string
+ buf []byte
+ expectedBuf []byte
+ addr tcpip.LinkAddress
+ }{
+ {
+ "Ethernet",
+ make([]byte, 8),
+ []byte{2, 1, 1, 2, 3, 4, 5, 6},
+ "\x01\x02\x03\x04\x05\x06",
+ },
+ {
+ "Padding",
+ []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+ []byte{2, 2, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0},
+ "\x01\x02\x03\x04\x05\x06\x07\x08",
+ },
+ {
+ "Empty",
+ []byte{},
+ []byte{},
+ "",
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ opts := NDPOptions(test.buf)
+ serializer := NDPOptionsSerializer{
+ NDPTargetLinkLayerAddressOption(test.addr),
+ }
+ if got, want := int(serializer.Length()), len(test.expectedBuf); got != want {
+ t.Fatalf("got Length = %d, want = %d", got, want)
+ }
+ opts.Serialize(serializer)
+ if !bytes.Equal(test.buf, test.expectedBuf) {
+ t.Fatalf("got b = %d, want = %d", test.buf, test.expectedBuf)
+ }
+ })
+ }
+}
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 18adb2085..14f197a77 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -72,7 +72,7 @@ func (e *Endpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.Vector
// InjectLinkAddr injects an inbound packet with a remote link address.
func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, vv buffer.VectorisedView) {
- e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, vv.Clone(nil))
+ e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, vv.Clone(nil), nil /* linkHeader */)
}
// Attach saves the stack network-layer dispatcher for use later when packets
@@ -96,7 +96,7 @@ func (e *Endpoint) MTU() uint32 {
func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
caps := stack.LinkEndpointCapabilities(0)
if e.GSO {
- caps |= stack.CapabilityGSO
+ caps |= stack.CapabilityHardwareGSO
}
return caps
}
@@ -134,5 +134,49 @@ func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, hdr buffer.Prepen
return nil
}
+// WritePackets stores outbound packets into the channel.
+func (e *Endpoint) WritePackets(_ *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ payloadView := payload.ToView()
+ n := 0
+packetLoop:
+ for i := range hdrs {
+ hdr := &hdrs[i].Hdr
+ off := hdrs[i].Off
+ size := hdrs[i].Size
+ p := PacketInfo{
+ Header: hdr.View(),
+ Proto: protocol,
+ Payload: buffer.NewViewFromBytes(payloadView[off : off+size]),
+ GSO: gso,
+ }
+
+ select {
+ case e.C <- p:
+ n++
+ default:
+ break packetLoop
+ }
+ }
+
+ return n, nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *Endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+ p := PacketInfo{
+ Header: packet.ToView(),
+ Proto: 0,
+ Payload: buffer.View{},
+ GSO: nil,
+ }
+
+ select {
+ case e.C <- p:
+ default:
+ }
+
+ return nil
+}
+
// Wait implements stack.LinkEndpoint.Wait.
func (*Endpoint) Wait() {}
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index f80ac3435..ae4858529 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -165,6 +165,9 @@ type Options struct {
// disabled.
GSOMaxSize uint32
+ // SoftwareGSOEnabled indicates whether software GSO is enabled or not.
+ SoftwareGSOEnabled bool
+
// PacketDispatchMode specifies the type of inbound dispatcher to be
// used for this endpoint.
PacketDispatchMode PacketDispatchMode
@@ -242,7 +245,11 @@ func New(opts *Options) (stack.LinkEndpoint, error) {
}
if isSocket {
if opts.GSOMaxSize != 0 {
- e.caps |= stack.CapabilityGSO
+ if opts.SoftwareGSOEnabled {
+ e.caps |= stack.CapabilitySoftwareGSO
+ } else {
+ e.caps |= stack.CapabilityHardwareGSO
+ }
e.gsoMaxSize = opts.GSOMaxSize
}
}
@@ -397,7 +404,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
eth.Encode(ethHdr)
}
- if e.Capabilities()&stack.CapabilityGSO != 0 {
+ if e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
vnetHdr := virtioNetHdr{}
vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr)
if gso != nil {
@@ -430,8 +437,130 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
return rawfile.NonBlockingWrite3(e.fds[0], hdr.View(), payload.ToView(), nil)
}
-// WriteRawPacket writes a raw packet directly to the file descriptor.
-func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error {
+// WritePackets writes outbound packets to the file descriptor. If it is not
+// currently writable, the packet is dropped.
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ var ethHdrBuf []byte
+ // hdr + data
+ iovLen := 2
+ if e.hdrSize > 0 {
+ // Add ethernet header if needed.
+ ethHdrBuf = make([]byte, header.EthernetMinimumSize)
+ eth := header.Ethernet(ethHdrBuf)
+ ethHdr := &header.EthernetFields{
+ DstAddr: r.RemoteLinkAddress,
+ Type: protocol,
+ }
+
+ // Preserve the src address if it's set in the route.
+ if r.LocalLinkAddress != "" {
+ ethHdr.SrcAddr = r.LocalLinkAddress
+ } else {
+ ethHdr.SrcAddr = e.addr
+ }
+ eth.Encode(ethHdr)
+ iovLen++
+ }
+
+ n := len(hdrs)
+
+ views := payload.Views()
+ /*
+ * Each bondary in views can add one more iovec.
+ *
+ * payload | | | |
+ * -----------------------------
+ * packets | | | | | | |
+ * -----------------------------
+ * iovecs | | | | | | | | |
+ */
+ iovec := make([]syscall.Iovec, n*iovLen+len(views)-1)
+ mmsgHdrs := make([]rawfile.MMsgHdr, n)
+
+ iovecIdx := 0
+ viewIdx := 0
+ viewOff := 0
+ off := 0
+ nextOff := 0
+ for i := range hdrs {
+ prevIovecIdx := iovecIdx
+ mmsgHdr := &mmsgHdrs[i]
+ mmsgHdr.Msg.Iov = &iovec[iovecIdx]
+ packetSize := hdrs[i].Size
+ hdr := &hdrs[i].Hdr
+
+ off = hdrs[i].Off
+ if off != nextOff {
+ // We stop in a different point last time.
+ size := packetSize
+ viewIdx = 0
+ viewOff = 0
+ for size > 0 {
+ if size >= len(views[viewIdx]) {
+ viewIdx++
+ viewOff = 0
+ size -= len(views[viewIdx])
+ } else {
+ viewOff = size
+ size = 0
+ }
+ }
+ }
+ nextOff = off + packetSize
+
+ if ethHdrBuf != nil {
+ v := &iovec[iovecIdx]
+ v.Base = &ethHdrBuf[0]
+ v.Len = uint64(len(ethHdrBuf))
+ iovecIdx++
+ }
+
+ v := &iovec[iovecIdx]
+ hdrView := hdr.View()
+ v.Base = &hdrView[0]
+ v.Len = uint64(len(hdrView))
+ iovecIdx++
+
+ for packetSize > 0 {
+ vec := &iovec[iovecIdx]
+ iovecIdx++
+
+ v := views[viewIdx]
+ vec.Base = &v[viewOff]
+ s := len(v) - viewOff
+ if s <= packetSize {
+ viewIdx++
+ viewOff = 0
+ } else {
+ s = packetSize
+ viewOff += s
+ }
+ vec.Len = uint64(s)
+ packetSize -= s
+ }
+
+ mmsgHdr.Msg.Iovlen = uint64(iovecIdx - prevIovecIdx)
+ }
+
+ packets := 0
+ for packets < n {
+ sent, err := rawfile.NonBlockingSendMMsg(e.fds[0], mmsgHdrs)
+ if err != nil {
+ return packets, err
+ }
+ packets += sent
+ mmsgHdrs = mmsgHdrs[sent:]
+ }
+ return packets, nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+ return rawfile.NonBlockingWrite(e.fds[0], packet.ToView())
+}
+
+// InjectOutobund implements stack.InjectableEndpoint.InjectOutbound.
+func (e *endpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error {
return rawfile.NonBlockingWrite(e.fds[0], packet)
}
@@ -468,9 +597,9 @@ func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
e.dispatcher = dispatcher
}
-// Inject injects an inbound packet.
-func (e *InjectableEndpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
- e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv)
+// InjectInbound injects an inbound packet.
+func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+ e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */)
}
// NewInjectable creates a new fd-based InjectableEndpoint.
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index 04406bc9a..59378b96c 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -43,9 +43,10 @@ const (
)
type packetInfo struct {
- raddr tcpip.LinkAddress
- proto tcpip.NetworkProtocolNumber
- contents buffer.View
+ raddr tcpip.LinkAddress
+ proto tcpip.NetworkProtocolNumber
+ contents buffer.View
+ linkHeader buffer.View
}
type context struct {
@@ -92,8 +93,8 @@ func (c *context) cleanup() {
syscall.Close(c.fds[1])
}
-func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
- c.ch <- packetInfo{remote, protocol, vv.ToView()}
+func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
+ c.ch <- packetInfo{remote, protocol, vv.ToView(), linkHeader}
}
func TestNoEthernetProperties(t *testing.T) {
@@ -293,11 +294,12 @@ func TestDeliverPacket(t *testing.T) {
b[i] = uint8(rand.Intn(256))
}
+ var hdr header.Ethernet
if !eth {
// So that it looks like an IPv4 packet.
b[0] = 0x40
} else {
- hdr := make(header.Ethernet, header.EthernetMinimumSize)
+ hdr = make(header.Ethernet, header.EthernetMinimumSize)
hdr.Encode(&header.EthernetFields{
SrcAddr: raddr,
DstAddr: laddr,
@@ -315,9 +317,10 @@ func TestDeliverPacket(t *testing.T) {
select {
case pi := <-c.ch:
want := packetInfo{
- raddr: raddr,
- proto: proto,
- contents: b,
+ raddr: raddr,
+ proto: proto,
+ contents: b,
+ linkHeader: buffer.View(hdr),
}
if !eth {
want.proto = header.IPv4ProtocolNumber
diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go
index 8bfeb97e4..554d45715 100644
--- a/pkg/tcpip/link/fdbased/mmap.go
+++ b/pkg/tcpip/link/fdbased/mmap.go
@@ -169,9 +169,10 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
var (
p tcpip.NetworkProtocolNumber
remote, local tcpip.LinkAddress
+ eth header.Ethernet
)
if d.e.hdrSize > 0 {
- eth := header.Ethernet(pkt)
+ eth = header.Ethernet(pkt)
p = eth.Type()
remote = eth.SourceAddress()
local = eth.DestinationAddress()
@@ -189,6 +190,6 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
}
pkt = pkt[d.e.hdrSize:]
- d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}))
+ d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}), buffer.View(eth))
return true, nil
}
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go
index 7ca217e5b..12168a1dc 100644
--- a/pkg/tcpip/link/fdbased/packet_dispatchers.go
+++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go
@@ -53,7 +53,7 @@ func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
d := &readVDispatcher{fd: fd, e: e}
d.views = make([]buffer.View, len(BufConfig))
iovLen := len(BufConfig)
- if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+ if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
iovLen++
}
d.iovecs = make([]syscall.Iovec, iovLen)
@@ -63,7 +63,7 @@ func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
func (d *readVDispatcher) allocateViews(bufConfig []int) {
var vnetHdr [virtioNetHdrSize]byte
vnetHdrOff := 0
- if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+ if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
// The kernel adds virtioNetHdr before each packet, but
// we don't use it, so so we allocate a buffer for it,
// add it in iovecs but don't add it in a view.
@@ -106,7 +106,7 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
if err != nil {
return false, err
}
- if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+ if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
// Skip virtioNetHdr which is added before each packet, it
// isn't used and it isn't in a view.
n -= virtioNetHdrSize
@@ -118,9 +118,10 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
var (
p tcpip.NetworkProtocolNumber
remote, local tcpip.LinkAddress
+ eth header.Ethernet
)
if d.e.hdrSize > 0 {
- eth := header.Ethernet(d.views[0])
+ eth = header.Ethernet(d.views[0][:header.EthernetMinimumSize])
p = eth.Type()
remote = eth.SourceAddress()
local = eth.DestinationAddress()
@@ -141,7 +142,7 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
vv := buffer.NewVectorisedView(n, d.views[:used])
vv.TrimFront(d.e.hdrSize)
- d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv)
+ d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv, buffer.View(eth))
// Prepare e.views for another packet: release used views.
for i := 0; i < used; i++ {
@@ -194,7 +195,7 @@ func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
}
d.iovecs = make([][]syscall.Iovec, MaxMsgsPerRecv)
iovLen := len(BufConfig)
- if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+ if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
// virtioNetHdr is prepended before each packet.
iovLen++
}
@@ -225,7 +226,7 @@ func (d *recvMMsgDispatcher) allocateViews(bufConfig []int) {
for k := 0; k < len(d.views); k++ {
var vnetHdr [virtioNetHdrSize]byte
vnetHdrOff := 0
- if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+ if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
// The kernel adds virtioNetHdr before each packet, but
// we don't use it, so so we allocate a buffer for it,
// add it in iovecs but don't add it in a view.
@@ -261,7 +262,7 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
// Process each of received packets.
for k := 0; k < nMsgs; k++ {
n := int(d.msgHdrs[k].Len)
- if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+ if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
n -= virtioNetHdrSize
}
if n <= d.e.hdrSize {
@@ -271,9 +272,10 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
var (
p tcpip.NetworkProtocolNumber
remote, local tcpip.LinkAddress
+ eth header.Ethernet
)
if d.e.hdrSize > 0 {
- eth := header.Ethernet(d.views[k][0])
+ eth = header.Ethernet(d.views[k][0])
p = eth.Type()
remote = eth.SourceAddress()
local = eth.DestinationAddress()
@@ -293,7 +295,7 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
used := d.capViews(k, int(n), BufConfig)
vv := buffer.NewVectorisedView(int(n), d.views[k][:used])
vv.TrimFront(d.e.hdrSize)
- d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv)
+ d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv, buffer.View(eth))
// Prepare e.views for another packet: release used views.
for i := 0; i < used; i++ {
diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD
index 47a54845c..23e4d1418 100644
--- a/pkg/tcpip/link/loopback/BUILD
+++ b/pkg/tcpip/link/loopback/BUILD
@@ -10,6 +10,7 @@ go_library(
deps = [
"//pkg/tcpip",
"//pkg/tcpip/buffer",
+ "//pkg/tcpip/header",
"//pkg/tcpip/stack",
],
)
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index b36629d2c..a3b48fa73 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -23,6 +23,7 @@ package loopback
import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
@@ -70,6 +71,9 @@ func (*endpoint) LinkAddress() tcpip.LinkAddress {
return ""
}
+// Wait implements stack.LinkEndpoint.Wait.
+func (*endpoint) Wait() {}
+
// WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound
// packets to the network-layer dispatcher.
func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
@@ -81,10 +85,27 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prependa
// Because we're immediately turning around and writing the packet back to the
// rx path, we intentionally don't preserve the remote and local link
// addresses from the stack.Route we're passed.
- e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv)
+ e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */)
return nil
}
-// Wait implements stack.LinkEndpoint.Wait.
-func (*endpoint) Wait() {}
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ panic("not implemented")
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+ // Reject the packet if it's shorter than an ethernet header.
+ if packet.Size() < header.EthernetMinimumSize {
+ return tcpip.ErrBadAddress
+ }
+
+ // There should be an ethernet header at the beginning of packet.
+ linkHeader := header.Ethernet(packet.First()[:header.EthernetMinimumSize])
+ packet.TrimFront(len(linkHeader))
+ e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), packet, buffer.View(linkHeader))
+
+ return nil
+}
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
index 7c946101d..682b60291 100644
--- a/pkg/tcpip/link/muxed/injectable.go
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -79,29 +79,47 @@ func (m *InjectableEndpoint) IsAttached() bool {
return m.dispatcher != nil
}
-// Inject implements stack.InjectableLinkEndpoint.
-func (m *InjectableEndpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
- m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, vv)
+// InjectInbound implements stack.InjectableLinkEndpoint.
+func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+ m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */)
+}
+
+// WritePackets writes outbound packets to the appropriate
+// LinkInjectableEndpoint based on the RemoteAddress. HandleLocal only works if
+// r.RemoteAddress has a route registered in this endpoint.
+func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ endpoint, ok := m.routes[r.RemoteAddress]
+ if !ok {
+ return 0, tcpip.ErrNoRoute
+ }
+ return endpoint.WritePackets(r, gso, hdrs, payload, protocol)
}
// WritePacket writes outbound packets to the appropriate LinkInjectableEndpoint
// based on the RemoteAddress. HandleLocal only works if r.RemoteAddress has a
// route registered in this endpoint.
-func (m *InjectableEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
+func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
if endpoint, ok := m.routes[r.RemoteAddress]; ok {
- return endpoint.WritePacket(r, nil /* gso */, hdr, payload, protocol)
+ return endpoint.WritePacket(r, gso, hdr, payload, protocol)
}
return tcpip.ErrNoRoute
}
-// WriteRawPacket writes outbound packets to the appropriate
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (m *InjectableEndpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+ // WriteRawPacket doesn't get a route or network address, so there's
+ // nowhere to write this.
+ return tcpip.ErrNoRoute
+}
+
+// InjectOutbound writes outbound packets to the appropriate
// LinkInjectableEndpoint based on the dest address.
-func (m *InjectableEndpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error {
+func (m *InjectableEndpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error {
endpoint, ok := m.routes[dest]
if !ok {
return tcpip.ErrNoRoute
}
- return endpoint.WriteRawPacket(dest, packet)
+ return endpoint.InjectOutbound(dest, packet)
}
// Wait implements stack.LinkEndpoint.Wait.
diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go
index 3086fec00..9cd300af8 100644
--- a/pkg/tcpip/link/muxed/injectable_test.go
+++ b/pkg/tcpip/link/muxed/injectable_test.go
@@ -31,7 +31,7 @@ import (
func TestInjectableEndpointRawDispatch(t *testing.T) {
endpoint, sock, dstIP := makeTestInjectableEndpoint(t)
- endpoint.WriteRawPacket(dstIP, []byte{0xFA})
+ endpoint.InjectOutbound(dstIP, []byte{0xFA})
buf := make([]byte, ipv4.MaxTotalSize)
bytesRead, err := sock.Read(buf)
diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD
index 2e8bc772a..05c7b8024 100644
--- a/pkg/tcpip/link/rawfile/BUILD
+++ b/pkg/tcpip/link/rawfile/BUILD
@@ -16,5 +16,8 @@ go_library(
visibility = [
"//visibility:public",
],
- deps = ["//pkg/tcpip"],
+ deps = [
+ "//pkg/tcpip",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
)
diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
index 7e286a3a6..44e25d475 100644
--- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go
+++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
@@ -22,6 +22,7 @@ import (
"syscall"
"unsafe"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/tcpip"
)
@@ -101,6 +102,16 @@ func NonBlockingWrite3(fd int, b1, b2, b3 []byte) *tcpip.Error {
return nil
}
+// NonBlockingSendMMsg sends multiple messages on a socket.
+func NonBlockingSendMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) {
+ n, _, e := syscall.RawSyscall6(unix.SYS_SENDMMSG, uintptr(fd), uintptr(unsafe.Pointer(&msgHdrs[0])), uintptr(len(msgHdrs)), syscall.MSG_DONTWAIT, 0, 0)
+ if e != 0 {
+ return 0, TranslateErrno(e)
+ }
+
+ return int(n), nil
+}
+
// PollEvent represents the pollfd structure passed to a poll() system call.
type PollEvent struct {
FD int32
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 9e71d4edf..279e2b457 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -212,6 +212,26 @@ func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.Prependa
return nil
}
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ panic("not implemented")
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+ v := packet.ToView()
+ // Transmit the packet.
+ e.mu.Lock()
+ ok := e.tx.transmit(v, buffer.View{})
+ e.mu.Unlock()
+
+ if !ok {
+ return tcpip.ErrWouldBlock
+ }
+
+ return nil
+}
+
// dispatchLoop reads packets from the rx queue in a loop and dispatches them
// to the network stack.
func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
@@ -254,7 +274,7 @@ func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
// Send packet up the stack.
eth := header.Ethernet(b)
- d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView())
+ d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(), buffer.View(eth))
}
// Clean state.
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 0e9ba0846..f3e9705c9 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -78,9 +78,10 @@ func (q *queueBuffers) cleanup() {
}
type packetInfo struct {
- addr tcpip.LinkAddress
- proto tcpip.NetworkProtocolNumber
- vv buffer.VectorisedView
+ addr tcpip.LinkAddress
+ proto tcpip.NetworkProtocolNumber
+ vv buffer.VectorisedView
+ linkHeader buffer.View
}
type testContext struct {
@@ -130,12 +131,13 @@ func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress
return c
}
-func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
c.mu.Lock()
c.packets = append(c.packets, packetInfo{
- addr: remoteLinkAddr,
- proto: proto,
- vv: vv.Clone(nil),
+ addr: remoteLinkAddr,
+ proto: proto,
+ vv: vv.Clone(nil),
+ linkHeader: linkHeader,
})
c.mu.Unlock()
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index e401dce44..39757ea2a 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -116,7 +116,7 @@ func NewWithFile(lower stack.LinkEndpoint, file *os.File, snapLen uint32) (stack
// DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is
// called by the link-layer endpoint being wrapped when a packet arrives, and
// logs the packet before forwarding to the actual dispatcher.
-func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
logPacket("recv", protocol, vv.First(), nil)
}
@@ -147,7 +147,7 @@ func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local
panic(err)
}
}
- e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv)
+ e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv, linkHeader)
}
// Attach implements the stack.LinkEndpoint interface. It saves the dispatcher
@@ -193,10 +193,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
return 0
}
-// WritePacket implements the stack.LinkEndpoint interface. It is called by
-// higher-level protocols to write packets; it just logs the packet and forwards
-// the request to the lower endpoint.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
+func (e *endpoint) dumpPacket(gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) {
if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
logPacket("send", protocol, hdr.View(), gso)
}
@@ -218,28 +215,74 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
panic(err)
}
length -= len(hdrBuf)
- if length > 0 {
- for _, v := range payload.Views() {
- if len(v) > length {
- v = v[:length]
- }
- n, err := buf.Write(v)
- if err != nil {
- panic(err)
- }
- length -= n
- if length == 0 {
- break
- }
- }
- }
+ logVectorisedView(payload, length, buf)
if _, err := e.file.Write(buf.Bytes()); err != nil {
panic(err)
}
}
+}
+
+// WritePacket implements the stack.LinkEndpoint interface. It is called by
+// higher-level protocols to write packets; it just logs the packet and
+// forwards the request to the lower endpoint.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
+ e.dumpPacket(gso, hdr, payload, protocol)
return e.lower.WritePacket(r, gso, hdr, payload, protocol)
}
+// WritePackets implements the stack.LinkEndpoint interface. It is called by
+// higher-level protocols to write packets; it just logs the packet and
+// forwards the request to the lower endpoint.
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ view := payload.ToView()
+ for _, d := range hdrs {
+ e.dumpPacket(gso, d.Hdr, buffer.NewVectorisedView(d.Size, []buffer.View{view[d.Off:][:d.Size]}), protocol)
+ }
+ return e.lower.WritePackets(r, gso, hdrs, payload, protocol)
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+ if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
+ logPacket("send", 0, buffer.View("[raw packet, no header available]"), nil /* gso */)
+ }
+ if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
+ length := packet.Size()
+ if length > int(e.maxPCAPLen) {
+ length = int(e.maxPCAPLen)
+ }
+
+ buf := bytes.NewBuffer(make([]byte, 0, pcapPacketHeaderLen+length))
+ if err := binary.Write(buf, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(packet.Size()))); err != nil {
+ panic(err)
+ }
+ logVectorisedView(packet, length, buf)
+ if _, err := e.file.Write(buf.Bytes()); err != nil {
+ panic(err)
+ }
+ }
+ return e.lower.WriteRawPacket(packet)
+}
+
+func logVectorisedView(vv buffer.VectorisedView, length int, buf *bytes.Buffer) {
+ if length <= 0 {
+ return
+ }
+ for _, v := range vv.Views() {
+ if len(v) > length {
+ v = v[:length]
+ }
+ n, err := buf.Write(v)
+ if err != nil {
+ panic(err)
+ }
+ length -= n
+ if length == 0 {
+ return
+ }
+ }
+}
+
// Wait implements stack.LinkEndpoint.Wait.
func (*endpoint) Wait() {}
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
index 5a1791cb5..a04fc1062 100644
--- a/pkg/tcpip/link/waitable/waitable.go
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -50,12 +50,12 @@ func New(lower stack.LinkEndpoint) *Endpoint {
// It is called by the link-layer endpoint being wrapped when a packet arrives,
// and only forwards to the actual dispatcher if Wait or WaitDispatch haven't
// been called.
-func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
if !e.dispatchGate.Enter() {
return
}
- e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv)
+ e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, vv, linkHeader)
e.dispatchGate.Leave()
}
@@ -109,6 +109,30 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
return err
}
+// WritePackets implements stack.LinkEndpoint.WritePackets. It is called by
+// higher-level protocols to write packets. It only forwards packets to the
+// lower endpoint if Wait or WaitWrite haven't been called.
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ if !e.writeGate.Enter() {
+ return len(hdrs), nil
+ }
+
+ n, err := e.lower.WritePackets(r, gso, hdrs, payload, protocol)
+ e.writeGate.Leave()
+ return n, err
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *Endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+ if !e.writeGate.Enter() {
+ return nil
+ }
+
+ err := e.lower.WriteRawPacket(packet)
+ e.writeGate.Leave()
+ return err
+}
+
// WaitWrite prevents new calls to WritePacket from reaching the lower endpoint,
// and waits for inflight ones to finish before returning.
func (e *Endpoint) WaitWrite() {
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
index ae23c96b7..5f0f8fa2d 100644
--- a/pkg/tcpip/link/waitable/waitable_test.go
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -35,7 +35,7 @@ type countedEndpoint struct {
dispatcher stack.NetworkDispatcher
}
-func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
e.dispatchCount++
}
@@ -70,6 +70,17 @@ func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.P
return nil
}
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ e.writeCount += len(hdrs)
+ return len(hdrs), nil
+}
+
+func (e *countedEndpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error {
+ e.writeCount++
+ return nil
+}
+
// Wait implements stack.LinkEndpoint.Wait.
func (*countedEndpoint) Wait() {}
@@ -109,21 +120,21 @@ func TestWaitDispatch(t *testing.T) {
}
// Dispatch and check that it goes through.
- ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{})
+ ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{})
if want := 1; ep.dispatchCount != want {
t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
}
// Wait on writes, then try to dispatch. It must go through.
wep.WaitWrite()
- ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{})
+ ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{})
if want := 2; ep.dispatchCount != want {
t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
}
// Wait on dispatches, then try to dispatch. It must not go through.
wep.WaitDispatch()
- ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{})
+ ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, buffer.VectorisedView{}, buffer.View{})
if want := 2; ep.dispatchCount != want {
t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
}
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 922181ac0..46178459e 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -83,6 +83,11 @@ func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, buffer.Prependable, buf
return tcpip.ErrNotSupported
}
+// WritePackets implements stack.NetworkEndpoint.WritePackets.
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []stack.PacketDescriptor, buffer.VectorisedView, stack.NetworkHeaderParams, stack.PacketLooping) (int, *tcpip.Error) {
+ return 0, tcpip.ErrNotSupported
+}
+
func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error {
return tcpip.ErrNotSupported
}
@@ -109,7 +114,11 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
copy(pkt.HardwareAddressTarget(), h.HardwareAddressSender())
copy(pkt.ProtocolAddressTarget(), h.ProtocolAddressSender())
e.linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber)
+ fallthrough // also fill the cache from requests
case header.ARPReply:
+ addr := tcpip.Address(h.ProtocolAddressSender())
+ linkAddr := tcpip.LinkAddress(h.HardwareAddressSender())
+ e.linkAddrCache.AddLinkAddress(e.nicid, addr, linkAddr)
}
}
diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD
index 825ff3392..2cad0a0b6 100644
--- a/pkg/tcpip/network/fragmentation/BUILD
+++ b/pkg/tcpip/network/fragmentation/BUILD
@@ -28,6 +28,7 @@ go_library(
visibility = ["//:sandbox"],
deps = [
"//pkg/log",
+ "//pkg/tcpip",
"//pkg/tcpip/buffer",
],
)
diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go
index 1628a82be..6da5238ec 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation.go
@@ -17,6 +17,7 @@
package fragmentation
import (
+ "fmt"
"log"
"sync"
"time"
@@ -82,7 +83,7 @@ func NewFragmentation(highMemoryLimit, lowMemoryLimit int, reassemblingTimeout t
// Process processes an incoming fragment belonging to an ID
// and returns a complete packet when all the packets belonging to that ID have been received.
-func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool) {
+func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, error) {
f.mu.Lock()
r, ok := f.reassemblers[id]
if ok && r.tooOld(f.timeout) {
@@ -97,8 +98,15 @@ func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buf
}
f.mu.Unlock()
- res, done, consumed := r.process(first, last, more, vv)
-
+ res, done, consumed, err := r.process(first, last, more, vv)
+ if err != nil {
+ // We probably got an invalid sequence of fragments. Just
+ // discard the reassembler and move on.
+ f.mu.Lock()
+ f.release(r)
+ f.mu.Unlock()
+ return buffer.VectorisedView{}, false, fmt.Errorf("fragmentation processing error: %v", err)
+ }
f.mu.Lock()
f.size += consumed
if done {
@@ -114,7 +122,7 @@ func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buf
}
}
f.mu.Unlock()
- return res, done
+ return res, done, nil
}
func (f *Fragmentation) release(r *reassembler) {
diff --git a/pkg/tcpip/network/fragmentation/fragmentation_test.go b/pkg/tcpip/network/fragmentation/fragmentation_test.go
index 799798544..72c0f53be 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation_test.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation_test.go
@@ -83,7 +83,10 @@ func TestFragmentationProcess(t *testing.T) {
t.Run(c.comment, func(t *testing.T) {
f := NewFragmentation(1024, 512, DefaultReassembleTimeout)
for i, in := range c.in {
- vv, done := f.Process(in.id, in.first, in.last, in.more, in.vv)
+ vv, done, err := f.Process(in.id, in.first, in.last, in.more, in.vv)
+ if err != nil {
+ t.Fatalf("f.Process(%+v, %+d, %+d, %t, %+v) failed: %v", in.id, in.first, in.last, in.more, in.vv, err)
+ }
if !reflect.DeepEqual(vv, c.out[i].vv) {
t.Errorf("got Process(%d) = %+v, want = %+v", i, vv, c.out[i].vv)
}
@@ -114,7 +117,10 @@ func TestReassemblingTimeout(t *testing.T) {
time.Sleep(2 * timeout)
// Send another fragment that completes a packet.
// However, no packet should be reassembled because the fragment arrived after the timeout.
- _, done := f.Process(0, 1, 1, false, vv(1, "1"))
+ _, done, err := f.Process(0, 1, 1, false, vv(1, "1"))
+ if err != nil {
+ t.Fatalf("f.Process(0, 1, 1, false, vv(1, \"1\")) failed: %v", err)
+ }
if done {
t.Errorf("Fragmentation does not respect the reassembling timeout.")
}
diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go
index 8037f734b..9e002e396 100644
--- a/pkg/tcpip/network/fragmentation/reassembler.go
+++ b/pkg/tcpip/network/fragmentation/reassembler.go
@@ -78,7 +78,7 @@ func (r *reassembler) updateHoles(first, last uint16, more bool) bool {
return used
}
-func (r *reassembler) process(first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, int) {
+func (r *reassembler) process(first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, int, error) {
r.mu.Lock()
defer r.mu.Unlock()
consumed := 0
@@ -86,7 +86,7 @@ func (r *reassembler) process(first, last uint16, more bool, vv buffer.Vectorise
// A concurrent goroutine might have already reassembled
// the packet and emptied the heap while this goroutine
// was waiting on the mutex. We don't have to do anything in this case.
- return buffer.VectorisedView{}, false, consumed
+ return buffer.VectorisedView{}, false, consumed, nil
}
if r.updateHoles(first, last, more) {
// We store the incoming packet only if it filled some holes.
@@ -96,13 +96,13 @@ func (r *reassembler) process(first, last uint16, more bool, vv buffer.Vectorise
}
// Check if all the holes have been deleted and we are ready to reassamble.
if r.deleted < len(r.holes) {
- return buffer.VectorisedView{}, false, consumed
+ return buffer.VectorisedView{}, false, consumed, nil
}
res, err := r.heap.reassemble()
if err != nil {
- panic(fmt.Sprintf("reassemble failed with: %v. There is probably a bug in the code handling the holes.", err))
+ return buffer.VectorisedView{}, false, consumed, fmt.Errorf("fragment reassembly failed: %v", err)
}
- return res, true, consumed
+ return res, true, consumed, nil
}
func (r *reassembler) tooOld(timeout time.Duration) bool {
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index f644a8b08..8d74497ba 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -171,6 +171,15 @@ func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prepen
return nil
}
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, hdr []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ panic("not implemented")
+}
+
+func (t *testObject) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error {
+ return tcpip.ErrNotSupported
+}
+
func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()},
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index df1a08113..1339f8474 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -198,10 +198,9 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, hdr buff
return nil
}
-// WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
+func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) {
ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
- length := uint16(hdr.UsedLength() + payload.Size())
+ length := uint16(hdr.UsedLength() + payloadSize)
id := uint32(0)
if length > header.IPv4MaximumHeaderSize+8 {
// Packets of 68 bytes or less are required by RFC 791 to not be
@@ -219,6 +218,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
DstAddr: r.RemoteAddress,
})
ip.SetChecksum(^ip.CalculateChecksum())
+}
+
+// WritePacket writes a packet to the given destination address and protocol.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
+ e.addIPHeader(r, &hdr, payload.Size(), params)
if loop&stack.PacketLoop != 0 {
views := make([]buffer.View, 1, 1+len(payload.Views()))
@@ -242,6 +246,23 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
return nil
}
+// WritePackets implements stack.NetworkEndpoint.WritePackets.
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
+ if loop&stack.PacketLoop != 0 {
+ panic("multiple packets in local loop")
+ }
+ if loop&stack.PacketOut == 0 {
+ return len(hdrs), nil
+ }
+
+ for i := range hdrs {
+ e.addIPHeader(r, &hdrs[i].Hdr, hdrs[i].Size, params)
+ }
+ n, err := e.linkEP.WritePackets(r, gso, hdrs, payload, ProtocolNumber)
+ r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+ return n, err
+}
+
// WriteHeaderIncludedPacket writes a packet already containing a network
// header through the given route.
func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error {
@@ -326,7 +347,13 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
return
}
var ready bool
- vv, ready = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, vv)
+ var err error
+ vv, ready, err = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, vv)
+ if err != nil {
+ r.Stats().IP.MalformedPacketsReceived.Increment()
+ r.Stats().IP.MalformedFragmentsReceived.Increment()
+ return
+ }
if !ready {
return
}
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index 560638ce8..99f84acd7 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -47,10 +47,6 @@ func TestExcludeBroadcast(t *testing.T) {
t.Fatalf("CreateNIC failed: %v", err)
}
- if err := s.AddAddress(1, ipv4.ProtocolNumber, header.IPv4Any); err != nil {
- t.Fatalf("AddAddress failed: %v", err)
- }
-
s.SetRouteTable([]tcpip.Route{{
Destination: header.IPv4EmptySubnet,
NIC: 1,
@@ -441,6 +437,16 @@ func TestInvalidFragments(t *testing.T) {
1,
1,
},
+ {
+ "multiple_fragments_with_more_fragments_set_to_false",
+ [][]byte{
+ {0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x00, 0x10, 0x00, 0x06, 0x34, 0x69, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x00, 0x01, 0x61, 0x06, 0x34, 0x69, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x20, 0x00, 0x00, 0x06, 0x34, 0x1e, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ },
+ 1,
+ 1,
+ },
}
for _, tc := range testCases {
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index b5df85455..b289e902f 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -21,15 +21,6 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
-const (
- // ndpHopLimit is the expected IP hop limit value of 255 for received
- // NDP packets, as per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1,
- // 7.1.2 and 8.1. If the hop limit value is not 255, nodes MUST silently
- // drop the NDP packet. All outgoing NDP packets must use this value for
- // its IP hop limit field.
- ndpHopLimit = 255
-)
-
// handleControl handles the case when an ICMP packet contains the headers of
// the original packet that caused the ICMP one to be sent. This information is
// used to find out which transport endpoint must be notified about the ICMP
@@ -79,6 +70,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
return
}
h := header.ICMPv6(v)
+ iph := header.IPv6(netHeader)
// As per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1, 7.1.2 and
// 8.1, nodes MUST silently drop NDP packets where the Hop Limit field
@@ -89,7 +81,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
header.ICMPv6RouterSolicit,
header.ICMPv6RouterAdvert,
header.ICMPv6RedirectMsg:
- if header.IPv6(netHeader).HopLimit() != ndpHopLimit {
+ if iph.HopLimit() != header.NDPHopLimit {
received.Invalid.Increment()
return
}
@@ -121,25 +113,72 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
case header.ICMPv6NeighborSolicit:
received.NeighborSolicit.Increment()
-
if len(v) < header.ICMPv6NeighborSolicitMinimumSize {
received.Invalid.Increment()
return
}
- targetAddr := tcpip.Address(v[8:][:header.IPv6AddressSize])
+
+ ns := header.NDPNeighborSolicit(h.NDPPayload())
+ targetAddr := ns.TargetAddress()
+ s := r.Stack()
+ rxNICID := r.NICID()
+
+ isTentative, err := s.IsAddrTentative(rxNICID, targetAddr)
+ if err != nil {
+ // We will only get an error if rxNICID is unrecognized,
+ // which should not happen. For now short-circuit this
+ // packet.
+ //
+ // TODO(b/141002840): Handle this better?
+ return
+ }
+
+ if isTentative {
+ // If the target address is tentative and the source
+ // of the packet is a unicast (specified) address, then
+ // the source of the packet is attempting to perform
+ // address resolution on the target. In this case, the
+ // solicitation is silently ignored, as per RFC 4862
+ // section 5.4.3.
+ //
+ // If the target address is tentative and the source of
+ // the packet is the unspecified address (::), then we
+ // know another node is also performing DAD for the
+ // same address (since targetAddr is tentative for us,
+ // we know we are also performing DAD on it). In this
+ // case we let the stack know so it can handle such a
+ // scenario and do nothing further with the NDP NS.
+ if iph.SourceAddress() == header.IPv6Any {
+ s.DupTentativeAddrDetected(rxNICID, targetAddr)
+ }
+
+ // Do not handle neighbor solicitations targeted
+ // to an address that is tentative on the received
+ // NIC any further.
+ return
+ }
+
+ // At this point we know that targetAddr is not tentative on
+ // rxNICID so the packet is processed as defined in RFC 4861,
+ // as per RFC 4862 section 5.4.3.
+
if e.linkAddrCache.CheckLocalAddress(e.nicid, ProtocolNumber, targetAddr) == 0 {
// We don't have a useful answer; the best we can do is ignore the request.
return
}
- hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertSize)
+ optsSerializer := header.NDPOptionsSerializer{
+ header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress[:]),
+ }
+ hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length()))
pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
pkt.SetType(header.ICMPv6NeighborAdvert)
- pkt[icmpV6FlagOffset] = ndpSolicitedFlag | ndpOverrideFlag
- copy(pkt[icmpV6OptOffset-len(targetAddr):], targetAddr)
- pkt[icmpV6OptOffset] = ndpOptDstLinkAddr
- pkt[icmpV6LengthOffset] = 1
- copy(pkt[icmpV6LengthOffset+1:], r.LocalLinkAddress[:])
+ na := header.NDPNeighborAdvert(pkt.NDPPayload())
+ na.SetSolicitedFlag(true)
+ na.SetOverrideFlag(true)
+ na.SetTargetAddress(targetAddr)
+ opts := na.Options()
+ opts.Serialize(optsSerializer)
// ICMPv6 Neighbor Solicit messages are always sent to
// specially crafted IPv6 multicast addresses. As a result, the
@@ -154,7 +193,22 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
r.LocalAddress = targetAddr
pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
- if err := r.WritePacket(nil /* gso */, hdr, buffer.VectorisedView{}, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}); err != nil {
+ // TODO(tamird/ghanan): there exists an explicit NDP option that is
+ // used to update the neighbor table with link addresses for a
+ // neighbor from an NS (see the Source Link Layer option RFC
+ // 4861 section 4.6.1 and section 7.2.3).
+ //
+ // Furthermore, the entirety of NDP handling here seems to be
+ // contradicted by RFC 4861.
+ e.linkAddrCache.AddLinkAddress(e.nicid, r.RemoteAddress, r.RemoteLinkAddress)
+
+ // RFC 4861 Neighbor Discovery for IP version 6 (IPv6)
+ //
+ // 7.1.2. Validation of Neighbor Advertisements
+ //
+ // The IP Hop Limit field has a value of 255, i.e., the packet
+ // could not possibly have been forwarded by a router.
+ if err := r.WritePacket(nil /* gso */, hdr, buffer.VectorisedView{}, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}); err != nil {
sent.Dropped.Increment()
return
}
@@ -166,7 +220,42 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
received.Invalid.Increment()
return
}
- targetAddr := tcpip.Address(v[8:][:header.IPv6AddressSize])
+
+ na := header.NDPNeighborAdvert(h.NDPPayload())
+ targetAddr := na.TargetAddress()
+ stack := r.Stack()
+ rxNICID := r.NICID()
+
+ isTentative, err := stack.IsAddrTentative(rxNICID, targetAddr)
+ if err != nil {
+ // We will only get an error if rxNICID is unrecognized,
+ // which should not happen. For now short-circuit this
+ // packet.
+ //
+ // TODO(b/141002840): Handle this better?
+ return
+ }
+
+ if isTentative {
+ // We just got an NA from a node that owns an address we
+ // are performing DAD on, implying the address is not
+ // unique. In this case we let the stack know so it can
+ // handle such a scenario and do nothing furthur with
+ // the NDP NA.
+ stack.DupTentativeAddrDetected(rxNICID, targetAddr)
+ return
+ }
+
+ // At this point we know that the targetAddress is not tentative
+ // on rxNICID. However, targetAddr may still be assigned to
+ // rxNICID but not tentative (it could be permanent). Such a
+ // scenario is beyond the scope of RFC 4862. As such, we simply
+ // ignore such a scenario for now and proceed as normal.
+ //
+ // TODO(b/143147598): Handle the scenario described above. Also
+ // inform the netstack integration that a duplicate address was
+ // detected outside of DAD.
+
e.linkAddrCache.AddLinkAddress(e.nicid, targetAddr, r.RemoteLinkAddress)
if targetAddr != r.RemoteAddress {
e.linkAddrCache.AddLinkAddress(e.nicid, r.RemoteAddress, r.RemoteLinkAddress)
@@ -178,7 +267,6 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V
received.Invalid.Increment()
return
}
-
vv.TrimFront(header.ICMPv6EchoMinimumSize)
hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize)
pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize))
@@ -262,7 +350,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
ip.Encode(&header.IPv6Fields{
PayloadLength: length,
NextHeader: uint8(header.ICMPv6ProtocolNumber),
- HopLimit: ndpHopLimit,
+ HopLimit: header.NDPHopLimit,
SrcAddr: r.LocalAddress,
DstAddr: r.RemoteAddress,
})
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index 501be208e..7c11dde55 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -15,7 +15,6 @@
package ipv6
import (
- "fmt"
"reflect"
"strings"
"testing"
@@ -144,7 +143,7 @@ func TestICMPCounts(t *testing.T) {
ip.Encode(&header.IPv6Fields{
PayloadLength: uint16(payloadLength),
NextHeader: uint8(header.ICMPv6ProtocolNumber),
- HopLimit: ndpHopLimit,
+ HopLimit: header.NDPHopLimit,
SrcAddr: r.LocalAddress,
DstAddr: r.RemoteAddress,
})
@@ -179,13 +178,10 @@ func visitStats(v reflect.Value, f func(string, *tcpip.StatCounter)) {
t := v.Type()
for i := 0; i < v.NumField(); i++ {
v := v.Field(i)
- switch v.Kind() {
- case reflect.Ptr:
- f(t.Field(i).Name, v.Interface().(*tcpip.StatCounter))
- case reflect.Struct:
+ if s, ok := v.Interface().(*tcpip.StatCounter); ok {
+ f(t.Field(i).Name, s)
+ } else {
visitStats(v, f)
- default:
- panic(fmt.Sprintf("unexpected type %s", v.Type()))
}
}
}
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index cd1e34085..5898f8f9e 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -97,9 +97,8 @@ func (e *endpoint) GSOMaxSize() uint32 {
return 0
}
-// WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
- length := uint16(hdr.UsedLength() + payload.Size())
+func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) {
+ length := uint16(hdr.UsedLength() + payloadSize)
ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
ip.Encode(&header.IPv6Fields{
PayloadLength: length,
@@ -109,6 +108,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
SrcAddr: r.LocalAddress,
DstAddr: r.RemoteAddress,
})
+}
+
+// WritePacket writes a packet to the given destination address and protocol.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error {
+ e.addIPHeader(r, &hdr, payload.Size(), params)
if loop&stack.PacketLoop != 0 {
views := make([]buffer.View, 1, 1+len(payload.Views()))
@@ -127,6 +131,26 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
return e.linkEP.WritePacket(r, gso, hdr, payload, ProtocolNumber)
}
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
+ if loop&stack.PacketLoop != 0 {
+ panic("not implemented")
+ }
+ if loop&stack.PacketOut == 0 {
+ return len(hdrs), nil
+ }
+
+ for i := range hdrs {
+ hdr := &hdrs[i].Hdr
+ size := hdrs[i].Size
+ e.addIPHeader(r, hdr, size, params)
+ }
+
+ n, err := e.linkEP.WritePackets(r, gso, hdrs, payload, ProtocolNumber)
+ r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+ return n, err
+}
+
// WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
// supported by IPv6.
func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error {
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index e30791fe3..c32716f2e 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -150,7 +150,7 @@ func TestHopLimitValidation(t *testing.T) {
// Receive the NDP packet with an invalid hop limit
// value.
- handleIPv6Payload(hdr, ndpHopLimit-1, ep, &r)
+ handleIPv6Payload(hdr, header.NDPHopLimit-1, ep, &r)
// Invalid count should have increased.
if got := invalid.Value(); got != 1 {
@@ -164,7 +164,7 @@ func TestHopLimitValidation(t *testing.T) {
}
// Receive the NDP packet with a valid hop limit value.
- handleIPv6Payload(hdr, ndpHopLimit, ep, &r)
+ handleIPv6Payload(hdr, header.NDPHopLimit, ep, &r)
// Rx count of NDP packet of type typ.typ should have
// increased.
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 6a78432c9..460db3cf8 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -22,6 +22,7 @@ go_library(
"icmp_rate_limit.go",
"linkaddrcache.go",
"linkaddrentry_list.go",
+ "ndp.go",
"nic.go",
"registration.go",
"route.go",
@@ -53,6 +54,7 @@ go_test(
name = "stack_x_test",
size = "small",
srcs = [
+ "ndp_test.go",
"stack_test.go",
"transport_demuxer_test.go",
"transport_test.go",
@@ -61,14 +63,17 @@ go_test(
":stack",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
+ "//pkg/tcpip/checker",
"//pkg/tcpip/header",
"//pkg/tcpip/iptables",
"//pkg/tcpip/link/channel",
"//pkg/tcpip/link/loopback",
"//pkg/tcpip/network/ipv4",
"//pkg/tcpip/network/ipv6",
+ "//pkg/tcpip/transport/icmp",
"//pkg/tcpip/transport/udp",
"//pkg/waiter",
+ "@com_github_google_go-cmp//cmp:go_default_library",
],
)
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
new file mode 100644
index 000000000..ea2dbed2e
--- /dev/null
+++ b/pkg/tcpip/stack/ndp.go
@@ -0,0 +1,319 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+ "fmt"
+ "log"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+const (
+ // defaultDupAddrDetectTransmits is the default number of NDP Neighbor
+ // Solicitation messages to send when doing Duplicate Address Detection
+ // for a tentative address.
+ //
+ // Default = 1 (from RFC 4862 section 5.1)
+ defaultDupAddrDetectTransmits = 1
+
+ // defaultRetransmitTimer is the default amount of time to wait between
+ // sending NDP Neighbor solicitation messages.
+ //
+ // Default = 1s (from RFC 4861 section 10).
+ defaultRetransmitTimer = time.Second
+
+ // minimumRetransmitTimer is the minimum amount of time to wait between
+ // sending NDP Neighbor solicitation messages. Note, RFC 4861 does
+ // not impose a minimum Retransmit Timer, but we do here to make sure
+ // the messages are not sent all at once. We also come to this value
+ // because in the RetransmitTimer field of a Router Advertisement, a
+ // value of 0 means unspecified, so the smallest valid value is 1.
+ // Note, the unit of the RetransmitTimer field in the Router
+ // Advertisement is milliseconds.
+ //
+ // Min = 1ms.
+ minimumRetransmitTimer = time.Millisecond
+)
+
+// NDPDispatcher is the interface integrators of netstack must implement to
+// receive and handle NDP related events.
+type NDPDispatcher interface {
+ // OnDuplicateAddressDetectionStatus will be called when the DAD process
+ // for an address (addr) on a NIC (with ID nicid) completes. resolved
+ // will be set to true if DAD completed successfully (no duplicate addr
+ // detected); false otherwise (addr was detected to be a duplicate on
+ // the link the NIC is a part of, or it was stopped for some other
+ // reason, such as the address being removed). If an error occured
+ // during DAD, err will be set and resolved must be ignored.
+ //
+ // This function is permitted to block indefinitely without interfering
+ // with the stack's operation.
+ OnDuplicateAddressDetectionStatus(nicid tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error)
+}
+
+// NDPConfigurations is the NDP configurations for the netstack.
+type NDPConfigurations struct {
+ // The number of Neighbor Solicitation messages to send when doing
+ // Duplicate Address Detection for a tentative address.
+ //
+ // Note, a value of zero effectively disables DAD.
+ DupAddrDetectTransmits uint8
+
+ // The amount of time to wait between sending Neighbor solicitation
+ // messages.
+ //
+ // Must be greater than 0.5s.
+ RetransmitTimer time.Duration
+}
+
+// DefaultNDPConfigurations returns an NDPConfigurations populated with
+// default values.
+func DefaultNDPConfigurations() NDPConfigurations {
+ return NDPConfigurations{
+ DupAddrDetectTransmits: defaultDupAddrDetectTransmits,
+ RetransmitTimer: defaultRetransmitTimer,
+ }
+}
+
+// validate modifies an NDPConfigurations with valid values. If invalid values
+// are present in c, the corresponding default values will be used instead.
+//
+// If RetransmitTimer is less than minimumRetransmitTimer, then a value of
+// defaultRetransmitTimer will be used.
+func (c *NDPConfigurations) validate() {
+ if c.RetransmitTimer < minimumRetransmitTimer {
+ c.RetransmitTimer = defaultRetransmitTimer
+ }
+}
+
+// ndpState is the per-interface NDP state.
+type ndpState struct {
+ // The NIC this ndpState is for.
+ nic *NIC
+
+ // The DAD state to send the next NS message, or resolve the address.
+ dad map[tcpip.Address]dadState
+}
+
+// dadState holds the Duplicate Address Detection timer and channel to signal
+// to the DAD goroutine that DAD should stop.
+type dadState struct {
+ // The DAD timer to send the next NS message, or resolve the address.
+ timer *time.Timer
+
+ // Used to let the DAD timer know that it has been stopped.
+ //
+ // Must only be read from or written to while protected by the lock of
+ // the NIC this dadState is associated with.
+ done *bool
+}
+
+// startDuplicateAddressDetection performs Duplicate Address Detection.
+//
+// This function must only be called by IPv6 addresses that are currently
+// tentative.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *referencedNetworkEndpoint) *tcpip.Error {
+ // addr must be a valid unicast IPv6 address.
+ if !header.IsV6UnicastAddress(addr) {
+ return tcpip.ErrAddressFamilyNotSupported
+ }
+
+ // Should not attempt to perform DAD on an address that is currently in
+ // the DAD process.
+ if _, ok := ndp.dad[addr]; ok {
+ // Should never happen because we should only ever call this
+ // function for newly created addresses. If we attemped to
+ // "add" an address that already existed, we would returned an
+ // error since we attempted to add a duplicate address, or its
+ // reference count would have been increased without doing the
+ // work that would have been done for an address that was brand
+ // new. See NIC.addPermanentAddressLocked.
+ panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID()))
+ }
+
+ remaining := ndp.nic.stack.ndpConfigs.DupAddrDetectTransmits
+
+ {
+ done, err := ndp.doDuplicateAddressDetection(addr, remaining, ref)
+ if err != nil {
+ return err
+ }
+ if done {
+ return nil
+ }
+ }
+
+ remaining--
+
+ var done bool
+ var timer *time.Timer
+ timer = time.AfterFunc(ndp.nic.stack.ndpConfigs.RetransmitTimer, func() {
+ var d bool
+ var err *tcpip.Error
+
+ // doDadIteration does a single iteration of the DAD loop.
+ //
+ // Returns true if the integrator needs to be informed of DAD
+ // completing.
+ doDadIteration := func() bool {
+ ndp.nic.mu.Lock()
+ defer ndp.nic.mu.Unlock()
+
+ if done {
+ // If we reach this point, it means that the DAD
+ // timer fired after another goroutine already
+ // obtained the NIC lock and stopped DAD before
+ // this function obtained the NIC lock. Simply
+ // return here and do nothing further.
+ return false
+ }
+
+ ref, ok := ndp.nic.endpoints[NetworkEndpointID{addr}]
+ if !ok {
+ // This should never happen.
+ // We should have an endpoint for addr since we
+ // are still performing DAD on it. If the
+ // endpoint does not exist, but we are doing DAD
+ // on it, then we started DAD at some point, but
+ // forgot to stop it when the endpoint was
+ // deleted.
+ panic(fmt.Sprintf("ndpdad: unrecognized addr %s for NIC(%d)", addr, ndp.nic.ID()))
+ }
+
+ d, err = ndp.doDuplicateAddressDetection(addr, remaining, ref)
+ if err != nil || d {
+ delete(ndp.dad, addr)
+
+ if err != nil {
+ log.Printf("ndpdad: Error occured during DAD iteration for addr (%s) on NIC(%d); err = %s", addr, ndp.nic.ID(), err)
+ }
+
+ // Let the integrator know DAD has completed.
+ return true
+ }
+
+ remaining--
+ timer.Reset(ndp.nic.stack.ndpConfigs.RetransmitTimer)
+ return false
+ }
+
+ if doDadIteration() && ndp.nic.stack.ndpDisp != nil {
+ ndp.nic.stack.ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, d, err)
+ }
+
+ })
+
+ ndp.dad[addr] = dadState{
+ timer: timer,
+ done: &done,
+ }
+
+ return nil
+}
+
+// doDuplicateAddressDetection is called on every iteration of the timer, and
+// when DAD starts.
+//
+// It handles resolving the address (if there are no more NS to send), or
+// sending the next NS if there are more NS to send.
+//
+// This function must only be called by IPv6 addresses that are currently
+// tentative.
+//
+// The NIC that ndp belongs to (n) MUST be locked.
+//
+// Returns true if DAD has resolved; false if DAD is still ongoing.
+func (ndp *ndpState) doDuplicateAddressDetection(addr tcpip.Address, remaining uint8, ref *referencedNetworkEndpoint) (bool, *tcpip.Error) {
+ if ref.getKind() != permanentTentative {
+ // The endpoint should still be marked as tentative
+ // since we are still performing DAD on it.
+ panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.nic.ID()))
+ }
+
+ if remaining == 0 {
+ // DAD has resolved.
+ ref.setKind(permanent)
+ return true, nil
+ }
+
+ // Send a new NS.
+ snmc := header.SolicitedNodeAddr(addr)
+ snmcRef, ok := ndp.nic.endpoints[NetworkEndpointID{snmc}]
+ if !ok {
+ // This should never happen as if we have the
+ // address, we should have the solicited-node
+ // address.
+ panic(fmt.Sprintf("ndpdad: NIC(%d) is not in the solicited-node multicast group (%s) but it has addr %s", ndp.nic.ID(), snmc, addr))
+ }
+
+ // Use the unspecified address as the source address when performing
+ // DAD.
+ r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), snmcRef, false, false)
+
+ hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
+ pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
+ pkt.SetType(header.ICMPv6NeighborSolicit)
+ ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+ ns.SetTargetAddress(addr)
+ pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+
+ sent := r.Stats().ICMP.V6PacketsSent
+ if err := r.WritePacket(nil, hdr, buffer.VectorisedView{}, NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: DefaultTOS}); err != nil {
+ sent.Dropped.Increment()
+ return false, err
+ }
+ sent.NeighborSolicit.Increment()
+
+ return false, nil
+}
+
+// stopDuplicateAddressDetection ends a running Duplicate Address Detection
+// process. Note, this may leave the DAD process for a tentative address in
+// such a state forever, unless some other external event resolves the DAD
+// process (receiving an NA from the true owner of addr, or an NS for addr
+// (implying another node is attempting to use addr)). It is up to the caller
+// of this function to handle such a scenario. Normally, addr will be removed
+// from n right after this function returns or the address successfully
+// resolved.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) {
+ dad, ok := ndp.dad[addr]
+ if !ok {
+ // Not currently performing DAD on addr, just return.
+ return
+ }
+
+ if dad.timer != nil {
+ dad.timer.Stop()
+ dad.timer = nil
+
+ *dad.done = true
+ dad.done = nil
+ }
+
+ delete(ndp.dad, addr)
+
+ // Let the integrator know DAD did not resolve.
+ if ndp.nic.stack.ndpDisp != nil {
+ go ndp.nic.stack.ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, false, nil)
+ }
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
new file mode 100644
index 000000000..b089ce2ae
--- /dev/null
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -0,0 +1,443 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack_test
+
+import (
+ "testing"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/checker"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/link/channel"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+)
+
+const (
+ addr1 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+ addr2 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+ linkAddr1 = "\x02\x02\x03\x04\x05\x06"
+)
+
+// TestDADDisabled tests that an address successfully resolves immediately
+// when DAD is not enabled (the default for an empty stack.Options).
+func TestDADDisabled(t *testing.T) {
+ opts := stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ }
+
+ e := channel.New(10, 1280, linkAddr1)
+ s := stack.New(opts)
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(_) = %s", err)
+ }
+
+ if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
+ t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+ }
+
+ // Should get the address immediately since we should not have performed
+ // DAD on it.
+ addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+ if err != nil {
+ t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+ }
+ if addr.Address != addr1 {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, addr1)
+ }
+
+ // We should not have sent any NDP NS messages.
+ if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != 0 {
+ t.Fatalf("got NeighborSolicit = %d, want = 0", got)
+ }
+}
+
+// ndpDADEvent is a set of parameters that was passed to
+// ndpDispatcher.OnDuplicateAddressDetectionStatus.
+type ndpDADEvent struct {
+ nicid tcpip.NICID
+ addr tcpip.Address
+ resolved bool
+ err *tcpip.Error
+}
+
+var _ stack.NDPDispatcher = (*ndpDispatcher)(nil)
+
+// ndpDispatcher implements NDPDispatcher so tests can know when various NDP
+// related events happen for test purposes.
+type ndpDispatcher struct {
+ dadC chan ndpDADEvent
+}
+
+// Implements stack.NDPDispatcher.OnDuplicateAddressDetectionStatus.
+//
+// If the DAD event matches what we are expecting, send signal on n.dadC.
+func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicid tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) {
+ n.dadC <- ndpDADEvent{
+ nicid,
+ addr,
+ resolved,
+ err,
+ }
+}
+
+// TestDADResolve tests that an address successfully resolves after performing
+// DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
+// Included in the subtests is a test to make sure that an invalid
+// RetransmitTimer (<1ms) values get fixed to the default RetransmitTimer of 1s.
+func TestDADResolve(t *testing.T) {
+ tests := []struct {
+ name string
+ dupAddrDetectTransmits uint8
+ retransTimer time.Duration
+ expectedRetransmitTimer time.Duration
+ }{
+ {"1:1s:1s", 1, time.Second, time.Second},
+ {"2:1s:1s", 2, time.Second, time.Second},
+ {"1:2s:2s", 1, 2 * time.Second, 2 * time.Second},
+ // 0s is an invalid RetransmitTimer timer and will be fixed to
+ // the default RetransmitTimer value of 1s.
+ {"1:0s:1s", 1, 0, time.Second},
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ndpDisp := ndpDispatcher{
+ dadC: make(chan ndpDADEvent),
+ }
+ opts := stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPDisp: &ndpDisp,
+ }
+ opts.NDPConfigs.RetransmitTimer = test.retransTimer
+ opts.NDPConfigs.DupAddrDetectTransmits = test.dupAddrDetectTransmits
+
+ e := channel.New(10, 1280, linkAddr1)
+ s := stack.New(opts)
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(_) = %s", err)
+ }
+
+ if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
+ t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+ }
+
+ stat := s.Stats().ICMP.V6PacketsSent.NeighborSolicit
+
+ // Should have sent an NDP NS immediately.
+ if got := stat.Value(); got != 1 {
+ t.Fatalf("got NeighborSolicit = %d, want = 1", got)
+
+ }
+
+ // Address should not be considered bound to the NIC yet
+ // (DAD ongoing).
+ addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+ if err != nil {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+ }
+ if want := (tcpip.AddressWithPrefix{}); addr != want {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+ }
+
+ // Wait for the remaining time - some delta (500ms), to
+ // make sure the address is still not resolved.
+ const delta = 500 * time.Millisecond
+ time.Sleep(test.expectedRetransmitTimer*time.Duration(test.dupAddrDetectTransmits) - delta)
+ addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+ if err != nil {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+ }
+ if want := (tcpip.AddressWithPrefix{}); addr != want {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+ }
+
+ // Wait for DAD to resolve.
+ select {
+ case <-time.After(2 * delta):
+ // We should get a resolution event after 500ms
+ // (delta) since we wait for 500ms less than the
+ // expected resolution time above to make sure
+ // that the address did not yet resolve. Waiting
+ // for 1s (2x delta) without a resolution event
+ // means something is wrong.
+ t.Fatal("timed out waiting for DAD resolution")
+ case e := <-ndpDisp.dadC:
+ if e.err != nil {
+ t.Fatal("got DAD error: ", e.err)
+ }
+ if e.nicid != 1 {
+ t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+ }
+ if e.addr != addr1 {
+ t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
+ }
+ if !e.resolved {
+ t.Fatal("got DAD event w/ resolved = false, want = true")
+ }
+ }
+ addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+ if err != nil {
+ t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+ }
+ if addr.Address != addr1 {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, addr1)
+ }
+
+ // Should not have sent any more NS messages.
+ if got := stat.Value(); got != uint64(test.dupAddrDetectTransmits) {
+ t.Fatalf("got NeighborSolicit = %d, want = %d", got, test.dupAddrDetectTransmits)
+ }
+
+ // Validate the sent Neighbor Solicitation messages.
+ for i := uint8(0); i < test.dupAddrDetectTransmits; i++ {
+ p := <-e.C
+
+ // Make sure its an IPv6 packet.
+ if p.Proto != header.IPv6ProtocolNumber {
+ t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+ }
+
+ // Check NDP packet.
+ checker.IPv6(t, p.Header.ToVectorisedView().First(),
+ checker.TTL(header.NDPHopLimit),
+ checker.NDPNS(
+ checker.NDPNSTargetAddress(addr1)))
+ }
+ })
+ }
+
+}
+
+// TestDADFail tests to make sure that the DAD process fails if another node is
+// detected to be performing DAD on the same address (receive an NS message from
+// a node doing DAD for the same address), or if another node is detected to own
+// the address already (receive an NA message for the tentative address).
+func TestDADFail(t *testing.T) {
+ tests := []struct {
+ name string
+ makeBuf func(tgt tcpip.Address) buffer.Prependable
+ getStat func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+ }{
+ {
+ "RxSolicit",
+ func(tgt tcpip.Address) buffer.Prependable {
+ hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborSolicitMinimumSize)
+ pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
+ pkt.SetType(header.ICMPv6NeighborSolicit)
+ ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+ ns.SetTargetAddress(tgt)
+ snmc := header.SolicitedNodeAddr(tgt)
+ pkt.SetChecksum(header.ICMPv6Checksum(pkt, header.IPv6Any, snmc, buffer.VectorisedView{}))
+ payloadLength := hdr.UsedLength()
+ ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ PayloadLength: uint16(payloadLength),
+ NextHeader: uint8(icmp.ProtocolNumber6),
+ HopLimit: 255,
+ SrcAddr: header.IPv6Any,
+ DstAddr: snmc,
+ })
+
+ return hdr
+
+ },
+ func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+ return s.NeighborSolicit
+ },
+ },
+ {
+ "RxAdvert",
+ func(tgt tcpip.Address) buffer.Prependable {
+ hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize)
+ pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
+ pkt.SetType(header.ICMPv6NeighborAdvert)
+ na := header.NDPNeighborAdvert(pkt.NDPPayload())
+ na.SetSolicitedFlag(true)
+ na.SetOverrideFlag(true)
+ na.SetTargetAddress(tgt)
+ pkt.SetChecksum(header.ICMPv6Checksum(pkt, tgt, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
+ payloadLength := hdr.UsedLength()
+ ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+ ip.Encode(&header.IPv6Fields{
+ PayloadLength: uint16(payloadLength),
+ NextHeader: uint8(icmp.ProtocolNumber6),
+ HopLimit: 255,
+ SrcAddr: tgt,
+ DstAddr: header.IPv6AllNodesMulticastAddress,
+ })
+
+ return hdr
+
+ },
+ func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+ return s.NeighborAdvert
+ },
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ndpDisp := ndpDispatcher{
+ dadC: make(chan ndpDADEvent),
+ }
+ ndpConfigs := stack.DefaultNDPConfigurations()
+ opts := stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: ndpConfigs,
+ NDPDisp: &ndpDisp,
+ }
+ opts.NDPConfigs.RetransmitTimer = time.Second * 2
+
+ e := channel.New(10, 1280, linkAddr1)
+ s := stack.New(opts)
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(_) = %s", err)
+ }
+
+ if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
+ t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+ }
+
+ // Address should not be considered bound to the NIC yet
+ // (DAD ongoing).
+ addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+ if err != nil {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+ }
+ if want := (tcpip.AddressWithPrefix{}); addr != want {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+ }
+
+ // Receive a packet to simulate multiple nodes owning or
+ // attempting to own the same address.
+ hdr := test.makeBuf(addr1)
+ e.Inject(header.IPv6ProtocolNumber, hdr.View().ToVectorisedView())
+
+ stat := test.getStat(s.Stats().ICMP.V6PacketsReceived)
+ if got := stat.Value(); got != 1 {
+ t.Fatalf("got stat = %d, want = 1", got)
+ }
+
+ // Wait for DAD to fail and make sure the address did
+ // not get resolved.
+ select {
+ case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+ // If we don't get a failure event after the
+ // expected resolution time + extra 1s buffer,
+ // something is wrong.
+ t.Fatal("timed out waiting for DAD failure")
+ case e := <-ndpDisp.dadC:
+ if e.err != nil {
+ t.Fatal("got DAD error: ", e.err)
+ }
+ if e.nicid != 1 {
+ t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+ }
+ if e.addr != addr1 {
+ t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
+ }
+ if e.resolved {
+ t.Fatal("got DAD event w/ resolved = true, want = false")
+ }
+ }
+ addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+ if err != nil {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+ }
+ if want := (tcpip.AddressWithPrefix{}); addr != want {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+ }
+ })
+ }
+}
+
+// TestDADStop tests to make sure that the DAD process stops when an address is
+// removed.
+func TestDADStop(t *testing.T) {
+ ndpDisp := ndpDispatcher{
+ dadC: make(chan ndpDADEvent),
+ }
+ ndpConfigs := stack.NDPConfigurations{
+ RetransmitTimer: time.Second,
+ DupAddrDetectTransmits: 2,
+ }
+ opts := stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPDisp: &ndpDisp,
+ NDPConfigs: ndpConfigs,
+ }
+
+ e := channel.New(10, 1280, linkAddr1)
+ s := stack.New(opts)
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(_) = %s", err)
+ }
+
+ if err := s.AddAddress(1, header.IPv6ProtocolNumber, addr1); err != nil {
+ t.Fatalf("AddAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr1, err)
+ }
+
+ // Address should not be considered bound to the NIC yet (DAD ongoing).
+ addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+ if err != nil {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+ }
+ if want := (tcpip.AddressWithPrefix{}); addr != want {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+ }
+
+ // Remove the address. This should stop DAD.
+ if err := s.RemoveAddress(1, addr1); err != nil {
+ t.Fatalf("RemoveAddress(_, %s) = %s", addr1, err)
+ }
+
+ // Wait for DAD to fail (since the address was removed during DAD).
+ select {
+ case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+ // If we don't get a failure event after the expected resolution
+ // time + extra 1s buffer, something is wrong.
+ t.Fatal("timed out waiting for DAD failure")
+ case e := <-ndpDisp.dadC:
+ if e.err != nil {
+ t.Fatal("got DAD error: ", e.err)
+ }
+ if e.nicid != 1 {
+ t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+ }
+ if e.addr != addr1 {
+ t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, addr1)
+ }
+ if e.resolved {
+ t.Fatal("got DAD event w/ resolved = true, want = false")
+ }
+
+ }
+ addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+ if err != nil {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+ }
+ if want := (tcpip.AddressWithPrefix{}); addr != want {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+ }
+
+ // Should not have sent more than 1 NS message.
+ if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got > 1 {
+ t.Fatalf("got NeighborSolicit = %d, want <= 1", got)
+ }
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 5993fe582..2d29fa88e 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -19,7 +19,6 @@ import (
"sync"
"sync/atomic"
- "gvisor.dev/gvisor/pkg/ilist"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -37,12 +36,17 @@ type NIC struct {
mu sync.RWMutex
spoofing bool
promiscuous bool
- primary map[tcpip.NetworkProtocolNumber]*ilist.List
+ primary map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint
endpoints map[NetworkEndpointID]*referencedNetworkEndpoint
addressRanges []tcpip.Subnet
mcastJoins map[NetworkEndpointID]int32
+ // packetEPs is protected by mu, but the contained PacketEndpoint
+ // values are not.
+ packetEPs map[tcpip.NetworkProtocolNumber][]PacketEndpoint
stats NICStats
+
+ ndp ndpState
}
// NICStats includes transmitted and received stats.
@@ -77,15 +81,19 @@ const (
)
func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback bool) *NIC {
- return &NIC{
+ // TODO(b/141011931): Validate a LinkEndpoint (ep) is valid. For
+ // example, make sure that the link address it provides is a valid
+ // unicast ethernet address.
+ nic := &NIC{
stack: stack,
id: id,
name: name,
linkEP: ep,
loopback: loopback,
- primary: make(map[tcpip.NetworkProtocolNumber]*ilist.List),
+ primary: make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint),
endpoints: make(map[NetworkEndpointID]*referencedNetworkEndpoint),
mcastJoins: make(map[NetworkEndpointID]int32),
+ packetEPs: make(map[tcpip.NetworkProtocolNumber][]PacketEndpoint),
stats: NICStats{
Tx: DirectionStats{
Packets: &tcpip.StatCounter{},
@@ -96,7 +104,21 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback
Bytes: &tcpip.StatCounter{},
},
},
+ ndp: ndpState{
+ dad: make(map[tcpip.Address]dadState),
+ },
+ }
+ nic.ndp.nic = nic
+
+ // Register supported packet endpoint protocols.
+ for _, netProto := range header.Ethertypes {
+ nic.packetEPs[netProto] = []PacketEndpoint{}
}
+ for _, netProto := range stack.networkProtocols {
+ nic.packetEPs[netProto.Number()] = []PacketEndpoint{}
+ }
+
+ return nic
}
// enable enables the NIC. enable will attach the link to its LinkEndpoint and
@@ -121,11 +143,50 @@ func (n *NIC) enable() *tcpip.Error {
// when we perform Duplicate Address Detection, or Router Advertisement
// when we do Router Discovery. See RFC 4862, section 5.4.2 and RFC 4861
// section 4.2 for more information.
- if _, ok := n.stack.networkProtocols[header.IPv6ProtocolNumber]; ok {
- return n.joinGroup(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress)
+ //
+ // Also auto-generate an IPv6 link-local address based on the NIC's
+ // link address if it is configured to do so. Note, each interface is
+ // required to have IPv6 link-local unicast address, as per RFC 4291
+ // section 2.1.
+ _, ok := n.stack.networkProtocols[header.IPv6ProtocolNumber]
+ if !ok {
+ return nil
}
- return nil
+ n.mu.Lock()
+ defer n.mu.Unlock()
+
+ if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil {
+ return err
+ }
+
+ if !n.stack.autoGenIPv6LinkLocal {
+ return nil
+ }
+
+ l2addr := n.linkEP.LinkAddress()
+
+ // Only attempt to generate the link-local address if we have a
+ // valid MAC address.
+ //
+ // TODO(b/141011931): Validate a LinkEndpoint's link address
+ // (provided by LinkEndpoint.LinkAddress) before reaching this
+ // point.
+ if !header.IsValidUnicastEthernetAddress(l2addr) {
+ return nil
+ }
+
+ addr := header.LinkLocalAddr(l2addr)
+
+ _, err := n.addPermanentAddressLocked(tcpip.ProtocolAddress{
+ Protocol: header.IPv6ProtocolNumber,
+ AddressWithPrefix: tcpip.AddressWithPrefix{
+ Address: addr,
+ PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen,
+ },
+ }, CanBePrimaryEndpoint)
+
+ return err
}
// attachLinkEndpoint attaches the NIC to the endpoint, which will enable it
@@ -161,18 +222,7 @@ func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedN
n.mu.RLock()
defer n.mu.RUnlock()
- list := n.primary[protocol]
- if list == nil {
- return nil
- }
-
- for e := list.Front(); e != nil; e = e.Next() {
- r := e.(*referencedNetworkEndpoint)
- // TODO(crawshaw): allow broadcast address when SO_BROADCAST is set.
- switch r.ep.ID().LocalAddress {
- case header.IPv4Broadcast, header.IPv4Any:
- continue
- }
+ for _, r := range n.primary[protocol] {
if r.isValidForOutgoing() && r.tryIncRef() {
return r
}
@@ -282,13 +332,35 @@ func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, p
id := NetworkEndpointID{protocolAddress.AddressWithPrefix.Address}
if ref, ok := n.endpoints[id]; ok {
switch ref.getKind() {
- case permanent:
+ case permanentTentative, permanent:
// The NIC already have a permanent endpoint with that address.
return nil, tcpip.ErrDuplicateAddress
case permanentExpired, temporary:
- // Promote the endpoint to become permanent.
+ // Promote the endpoint to become permanent and respect
+ // the new peb.
if ref.tryIncRef() {
ref.setKind(permanent)
+
+ refs := n.primary[ref.protocol]
+ for i, r := range refs {
+ if r == ref {
+ switch peb {
+ case CanBePrimaryEndpoint:
+ return ref, nil
+ case FirstPrimaryEndpoint:
+ if i == 0 {
+ return ref, nil
+ }
+ n.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+ case NeverPrimaryEndpoint:
+ n.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+ return ref, nil
+ }
+ }
+ }
+
+ n.insertPrimaryEndpointLocked(ref, peb)
+
return ref, nil
}
// tryIncRef failing means the endpoint is scheduled to be removed once
@@ -298,6 +370,7 @@ func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, p
n.removeEndpointLocked(ref)
}
}
+
return n.addAddressLocked(protocolAddress, peb, permanent)
}
@@ -321,6 +394,15 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
if err != nil {
return nil, err
}
+
+ isIPv6Unicast := protocolAddress.Protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(protocolAddress.AddressWithPrefix.Address)
+
+ // If the address is an IPv6 address and it is a permanent address,
+ // mark it as tentative so it goes through the DAD process.
+ if isIPv6Unicast && kind == permanent {
+ kind = permanentTentative
+ }
+
ref := &referencedNetworkEndpoint{
refs: 1,
ep: ep,
@@ -338,7 +420,7 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
// If we are adding an IPv6 unicast address, join the solicited-node
// multicast address.
- if protocolAddress.Protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(protocolAddress.AddressWithPrefix.Address) {
+ if isIPv6Unicast {
snmc := header.SolicitedNodeAddr(protocolAddress.AddressWithPrefix.Address)
if err := n.joinGroupLocked(protocolAddress.Protocol, snmc); err != nil {
return nil, err
@@ -347,17 +429,13 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
n.endpoints[id] = ref
- l, ok := n.primary[protocolAddress.Protocol]
- if !ok {
- l = &ilist.List{}
- n.primary[protocolAddress.Protocol] = l
- }
+ n.insertPrimaryEndpointLocked(ref, peb)
- switch peb {
- case CanBePrimaryEndpoint:
- l.PushBack(ref)
- case FirstPrimaryEndpoint:
- l.PushFront(ref)
+ // If we are adding a tentative IPv6 address, start DAD.
+ if isIPv6Unicast && kind == permanentTentative {
+ if err := n.ndp.startDuplicateAddressDetection(protocolAddress.AddressWithPrefix.Address, ref); err != nil {
+ return nil, err
+ }
}
return ref, nil
@@ -382,10 +460,12 @@ func (n *NIC) AllAddresses() []tcpip.ProtocolAddress {
addrs := make([]tcpip.ProtocolAddress, 0, len(n.endpoints))
for nid, ref := range n.endpoints {
- // Don't include expired or temporary endpoints to avoid confusion and
- // prevent the caller from using those.
+ // Don't include tentative, expired or temporary endpoints to
+ // avoid confusion and prevent the caller from using those.
switch ref.getKind() {
- case permanentExpired, temporary:
+ case permanentTentative, permanentExpired, temporary:
+ // TODO(b/140898488): Should tentative addresses be
+ // returned?
continue
}
addrs = append(addrs, tcpip.ProtocolAddress{
@@ -406,12 +486,12 @@ func (n *NIC) PrimaryAddresses() []tcpip.ProtocolAddress {
var addrs []tcpip.ProtocolAddress
for proto, list := range n.primary {
- for e := list.Front(); e != nil; e = e.Next() {
- ref := e.(*referencedNetworkEndpoint)
- // Don't include expired or tempory endpoints to avoid confusion and
- // prevent the caller from using those.
+ for _, ref := range list {
+ // Don't include tentative, expired or tempory endpoints
+ // to avoid confusion and prevent the caller from using
+ // those.
switch ref.getKind() {
- case permanentExpired, temporary:
+ case permanentTentative, permanentExpired, temporary:
continue
}
@@ -471,6 +551,19 @@ func (n *NIC) AddressRanges() []tcpip.Subnet {
return append(sns, n.addressRanges...)
}
+// insertPrimaryEndpointLocked adds r to n's primary endpoint list as required
+// by peb.
+//
+// n MUST be locked.
+func (n *NIC) insertPrimaryEndpointLocked(r *referencedNetworkEndpoint, peb PrimaryEndpointBehavior) {
+ switch peb {
+ case CanBePrimaryEndpoint:
+ n.primary[r.protocol] = append(n.primary[r.protocol], r)
+ case FirstPrimaryEndpoint:
+ n.primary[r.protocol] = append([]*referencedNetworkEndpoint{r}, n.primary[r.protocol]...)
+ }
+}
+
func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) {
id := *r.ep.ID()
@@ -488,9 +581,12 @@ func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) {
}
delete(n.endpoints, id)
- wasInList := r.Next() != nil || r.Prev() != nil || r == n.primary[r.protocol].Front()
- if wasInList {
- n.primary[r.protocol].Remove(r)
+ refs := n.primary[r.protocol]
+ for i, ref := range refs {
+ if ref == r {
+ n.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+ break
+ }
}
r.ep.Close()
@@ -504,10 +600,22 @@ func (n *NIC) removeEndpoint(r *referencedNetworkEndpoint) {
func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
r, ok := n.endpoints[NetworkEndpointID{addr}]
- if !ok || r.getKind() != permanent {
+ if !ok {
return tcpip.ErrBadLocalAddress
}
+ kind := r.getKind()
+ if kind != permanent && kind != permanentTentative {
+ return tcpip.ErrBadLocalAddress
+ }
+
+ isIPv6Unicast := r.protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(addr)
+
+ // If we are removing a tentative IPv6 unicast address, stop DAD.
+ if isIPv6Unicast && kind == permanentTentative {
+ n.ndp.stopDuplicateAddressDetection(addr)
+ }
+
r.setKind(permanentExpired)
if !r.decRefLocked() {
// The endpoint still has references to it.
@@ -518,7 +626,7 @@ func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
// If we are removing an IPv6 unicast address, leave the solicited-node
// multicast address.
- if r.protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(addr) {
+ if isIPv6Unicast {
snmc := header.SolicitedNodeAddr(addr)
if err := n.leaveGroupLocked(snmc); err != nil {
return err
@@ -548,6 +656,11 @@ func (n *NIC) joinGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address
// exists yet. Otherwise it just increments its count. n MUST be locked before
// joinGroupLocked is called.
func (n *NIC) joinGroupLocked(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error {
+ // TODO(b/143102137): When implementing MLD, make sure MLD packets are
+ // not sent unless a valid link-local address is available for use on n
+ // as an MLD packet's source address must be a link-local address as
+ // outlined in RFC 3810 section 5.
+
id := NetworkEndpointID{addr}
joins := n.mcastJoins[id]
if joins == 0 {
@@ -611,7 +724,7 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address,
// Note that the ownership of the slice backing vv is retained by the caller.
// This rule applies only to the slice itself, not to the items of the slice;
// the ownership of the items is not retained by the caller.
-func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View) {
n.stats.Rx.Packets.Increment()
n.stats.Rx.Bytes.IncrementBy(uint64(vv.Size()))
@@ -621,6 +734,26 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr
return
}
+ // If no local link layer address is provided, assume it was sent
+ // directly to this NIC.
+ if local == "" {
+ local = n.linkEP.LinkAddress()
+ }
+
+ // Are any packet sockets listening for this network protocol?
+ n.mu.RLock()
+ packetEPs := n.packetEPs[protocol]
+ // Check whether there are packet sockets listening for every protocol.
+ // If we received a packet with protocol EthernetProtocolAll, then the
+ // previous for loop will have handled it.
+ if protocol != header.EthernetProtocolAll {
+ packetEPs = append(packetEPs, n.packetEPs[header.EthernetProtocolAll]...)
+ }
+ n.mu.RUnlock()
+ for _, ep := range packetEPs {
+ ep.HandlePacket(n.id, local, protocol, vv, linkHeader)
+ }
+
if netProto.Number() == header.IPv4ProtocolNumber || netProto.Number() == header.IPv6ProtocolNumber {
n.stack.stats.IP.PacketsReceived.Increment()
}
@@ -632,8 +765,6 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr
src, dst := netProto.ParseAddresses(vv.First())
- n.stack.AddLinkAddress(n.id, src, remote)
-
if ref := n.getRef(protocol, dst); ref != nil {
handlePacket(protocol, dst, src, linkEP.LinkAddress(), remote, ref, vv)
return
@@ -682,7 +813,10 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr
return
}
- n.stack.stats.IP.InvalidAddressesReceived.Increment()
+ // If a packet socket handled the packet, don't treat it as invalid.
+ if len(packetEPs) == 0 {
+ n.stack.stats.IP.InvalidAddressesReceived.Increment()
+ }
}
// DeliverTransportPacket delivers the packets to the appropriate transport
@@ -769,14 +903,58 @@ func (n *NIC) Stack() *Stack {
return n.stack
}
+// isAddrTentative returns true if addr is tentative on n.
+//
+// Note that if addr is not associated with n, then this function will return
+// false. It will only return true if the address is associated with the NIC
+// AND it is tentative.
+func (n *NIC) isAddrTentative(addr tcpip.Address) bool {
+ ref, ok := n.endpoints[NetworkEndpointID{addr}]
+ if !ok {
+ return false
+ }
+
+ return ref.getKind() == permanentTentative
+}
+
+// dupTentativeAddrDetected attempts to inform n that a tentative addr
+// is a duplicate on a link.
+//
+// dupTentativeAddrDetected will delete the tentative address if it exists.
+func (n *NIC) dupTentativeAddrDetected(addr tcpip.Address) *tcpip.Error {
+ n.mu.Lock()
+ defer n.mu.Unlock()
+
+ ref, ok := n.endpoints[NetworkEndpointID{addr}]
+ if !ok {
+ return tcpip.ErrBadAddress
+ }
+
+ if ref.getKind() != permanentTentative {
+ return tcpip.ErrInvalidEndpointState
+ }
+
+ return n.removePermanentAddressLocked(addr)
+}
+
type networkEndpointKind int32
const (
+ // A permanentTentative endpoint is a permanent address that is not yet
+ // considered to be fully bound to an interface in the traditional
+ // sense. That is, the address is associated with a NIC, but packets
+ // destined to the address MUST NOT be accepted and MUST be silently
+ // dropped, and the address MUST NOT be used as a source address for
+ // outgoing packets. For IPv6, addresses will be of this kind until
+ // NDP's Duplicate Address Detection has resolved, or be deleted if
+ // the process results in detecting a duplicate address.
+ permanentTentative networkEndpointKind = iota
+
// A permanent endpoint is created by adding a permanent address (vs. a
// temporary one) to the NIC. Its reference count is biased by 1 to avoid
// removal when no route holds a reference to it. It is removed by explicitly
// removing the permanent address from the NIC.
- permanent networkEndpointKind = iota
+ permanent
// An expired permanent endoint is a permanent endoint that had its address
// removed from the NIC, and it is waiting to be removed once no more routes
@@ -794,8 +972,37 @@ const (
temporary
)
+func (n *NIC) registerPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) *tcpip.Error {
+ n.mu.Lock()
+ defer n.mu.Unlock()
+
+ eps, ok := n.packetEPs[netProto]
+ if !ok {
+ return tcpip.ErrNotSupported
+ }
+ n.packetEPs[netProto] = append(eps, ep)
+
+ return nil
+}
+
+func (n *NIC) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) {
+ n.mu.Lock()
+ defer n.mu.Unlock()
+
+ eps, ok := n.packetEPs[netProto]
+ if !ok {
+ return
+ }
+
+ for i, epOther := range eps {
+ if epOther == ep {
+ n.packetEPs[netProto] = append(eps[:i], eps[i+1:]...)
+ return
+ }
+ }
+}
+
type referencedNetworkEndpoint struct {
- ilist.Entry
ep NetworkEndpoint
nic *NIC
protocol tcpip.NetworkProtocolNumber
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 9d6157f22..0869fb084 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -71,8 +71,8 @@ type TransportEndpoint interface {
// RawTransportEndpoint is the interface that needs to be implemented by raw
// transport protocol endpoints. RawTransportEndpoints receive the entire
-// packet - including the link, network, and transport headers - as delivered
-// to netstack.
+// packet - including the network and transport headers - as delivered to
+// netstack.
type RawTransportEndpoint interface {
// HandlePacket is called by the stack when new packets arrive to
// this transport endpoint. The packet contains all data from the link
@@ -80,6 +80,22 @@ type RawTransportEndpoint interface {
HandlePacket(r *Route, netHeader buffer.View, packet buffer.VectorisedView)
}
+// PacketEndpoint is the interface that needs to be implemented by packet
+// transport protocol endpoints. These endpoints receive link layer headers in
+// addition to whatever they contain (usually network and transport layer
+// headers and a payload).
+type PacketEndpoint interface {
+ // HandlePacket is called by the stack when new packets arrive that
+ // match the endpoint.
+ //
+ // Implementers should treat packet as immutable and should copy it
+ // before before modification.
+ //
+ // linkHeader may have a length of 0, in which case the PacketEndpoint
+ // should construct its own ethernet header for applications.
+ HandlePacket(nicid tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, packet buffer.VectorisedView, linkHeader buffer.View)
+}
+
// TransportProtocol is the interface that needs to be implemented by transport
// protocols (e.g., tcp, udp) that want to be part of the networking stack.
type TransportProtocol interface {
@@ -185,6 +201,10 @@ type NetworkEndpoint interface {
// protocol.
WritePacket(r *Route, gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params NetworkHeaderParams, loop PacketLooping) *tcpip.Error
+ // WritePackets writes packets to the given destination address and
+ // protocol.
+ WritePackets(r *Route, gso *GSO, hdrs []PacketDescriptor, payload buffer.VectorisedView, params NetworkHeaderParams, loop PacketLooping) (int, *tcpip.Error)
+
// WriteHeaderIncludedPacket writes a packet that includes a network
// header to the given destination address.
WriteHeaderIncludedPacket(r *Route, payload buffer.VectorisedView, loop PacketLooping) *tcpip.Error
@@ -242,9 +262,10 @@ type NetworkProtocol interface {
// packets to the appropriate network endpoint after it has been handled by
// the data link layer.
type NetworkDispatcher interface {
- // DeliverNetworkPacket finds the appropriate network protocol
- // endpoint and hands the packet over for further processing.
- DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView)
+ // DeliverNetworkPacket finds the appropriate network protocol endpoint
+ // and hands the packet over for further processing. linkHeader may have
+ // length 0 when the caller does not have ethernet data.
+ DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, linkHeader buffer.View)
}
// LinkEndpointCapabilities is the type associated with the capabilities
@@ -266,7 +287,11 @@ const (
CapabilitySaveRestore
CapabilityDisconnectOk
CapabilityLoopback
- CapabilityGSO
+ CapabilityHardwareGSO
+
+ // CapabilitySoftwareGSO indicates the link endpoint supports of sending
+ // multiple packets using a single call (LinkEndpoint.WritePackets).
+ CapabilitySoftwareGSO
)
// LinkEndpoint is the interface implemented by data link layer protocols (e.g.,
@@ -301,6 +326,18 @@ type LinkEndpoint interface {
// r.LocalLinkAddress if it is provided.
WritePacket(r *Route, gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error
+ // WritePackets writes packets with the given protocol through the
+ // given route.
+ //
+ // Right now, WritePackets is used only when the software segmentation
+ // offload is enabled. If it will be used for something else, it may
+ // require to change syscall filters.
+ WritePackets(r *Route, gso *GSO, hdrs []PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
+
+ // WriteRawPacket writes a packet directly to the link. The packet
+ // should already have an ethernet header.
+ WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error
+
// Attach attaches the data link layer endpoint to the network-layer
// dispatcher of the stack.
Attach(dispatcher NetworkDispatcher)
@@ -324,13 +361,14 @@ type LinkEndpoint interface {
type InjectableLinkEndpoint interface {
LinkEndpoint
- // Inject injects an inbound packet.
- Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView)
+ // InjectInbound injects an inbound packet.
+ InjectInbound(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView)
- // WriteRawPacket writes a fully formed outbound packet directly to the link.
+ // InjectOutbound writes a fully formed outbound packet directly to the
+ // link.
//
// dest is used by endpoints with multiple raw destinations.
- WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error
+ InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error
}
// A LinkAddressResolver is an extension to a NetworkProtocol that
@@ -379,11 +417,16 @@ type LinkAddressCache interface {
RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.Waker)
}
-// UnassociatedEndpointFactory produces endpoints for writing packets not
-// associated with a particular transport protocol. Such endpoints can be used
-// to write arbitrary packets that include the IP header.
-type UnassociatedEndpointFactory interface {
- NewUnassociatedRawEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+// RawFactory produces endpoints for writing various types of raw packets.
+type RawFactory interface {
+ // NewUnassociatedEndpoint produces endpoints for writing packets not
+ // associated with a particular transport protocol. Such endpoints can
+ // be used to write arbitrary packets that include the network header.
+ NewUnassociatedEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+
+ // NewPacketEndpoint produces endpoints for reading and writing packets
+ // that include network and (when cooked is false) link layer headers.
+ NewPacketEndpoint(stack *Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
}
// GSOType is the type of GSO segments.
@@ -394,8 +437,14 @@ type GSOType int
// Types of gso segments.
const (
GSONone GSOType = iota
+
+ // Hardware GSO types:
GSOTCPv4
GSOTCPv6
+
+ // GSOSW is used for software GSO segments which have to be sent by
+ // endpoint.WritePackets.
+ GSOSW
)
// GSO contains generic segmentation offload properties.
@@ -423,3 +472,7 @@ type GSOEndpoint interface {
// GSOMaxSize returns the maximum GSO packet size.
GSOMaxSize() uint32
}
+
+// SoftwareGSOMaxSize is a maximum allowed size of a software GSO segment.
+// This isn't a hard limit, because it is never set into packet headers.
+const SoftwareGSOMaxSize = (1 << 16)
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index e72373964..1a0a51b57 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -47,8 +47,8 @@ type Route struct {
// starts.
ref *referencedNetworkEndpoint
- // loop controls where WritePacket should send packets.
- loop PacketLooping
+ // Loop controls where WritePacket should send packets.
+ Loop PacketLooping
}
// makeRoute initializes a new route. It takes ownership of the provided
@@ -69,7 +69,7 @@ func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip
LocalLinkAddress: localLinkAddr,
RemoteAddress: remoteAddr,
ref: ref,
- loop: loop,
+ Loop: loop,
}
}
@@ -159,7 +159,7 @@ func (r *Route) WritePacket(gso *GSO, hdr buffer.Prependable, payload buffer.Vec
return tcpip.ErrInvalidEndpointState
}
- err := r.ref.ep.WritePacket(r, gso, hdr, payload, params, r.loop)
+ err := r.ref.ep.WritePacket(r, gso, hdr, payload, params, r.Loop)
if err != nil {
r.Stats().IP.OutgoingPacketErrors.Increment()
} else {
@@ -169,6 +169,44 @@ func (r *Route) WritePacket(gso *GSO, hdr buffer.Prependable, payload buffer.Vec
return err
}
+// PacketDescriptor is a packet descriptor which contains a packet header and
+// offset and size of packet data in a payload view.
+type PacketDescriptor struct {
+ Hdr buffer.Prependable
+ Off int
+ Size int
+}
+
+// NewPacketDescriptors allocates a set of packet descriptors.
+func NewPacketDescriptors(n int, hdrSize int) []PacketDescriptor {
+ buf := make([]byte, n*hdrSize)
+ hdrs := make([]PacketDescriptor, n)
+ for i := range hdrs {
+ hdrs[i].Hdr = buffer.NewEmptyPrependableFromView(buf[i*hdrSize:][:hdrSize])
+ }
+ return hdrs
+}
+
+// WritePackets writes the set of packets through the given route.
+func (r *Route) WritePackets(gso *GSO, hdrs []PacketDescriptor, payload buffer.VectorisedView, params NetworkHeaderParams) (int, *tcpip.Error) {
+ if !r.ref.isValidForOutgoing() {
+ return 0, tcpip.ErrInvalidEndpointState
+ }
+
+ n, err := r.ref.ep.WritePackets(r, gso, hdrs, payload, params, r.Loop)
+ if err != nil {
+ r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(len(hdrs) - n))
+ }
+ r.ref.nic.stats.Tx.Packets.IncrementBy(uint64(n))
+ payloadSize := 0
+ for i := 0; i < n; i++ {
+ r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(hdrs[i].Hdr.UsedLength()))
+ payloadSize += hdrs[i].Size
+ }
+ r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(payloadSize))
+ return n, err
+}
+
// WriteHeaderIncludedPacket writes a packet already containing a network
// header through the given route.
func (r *Route) WriteHeaderIncludedPacket(payload buffer.VectorisedView) *tcpip.Error {
@@ -176,7 +214,7 @@ func (r *Route) WriteHeaderIncludedPacket(payload buffer.VectorisedView) *tcpip.
return tcpip.ErrInvalidEndpointState
}
- if err := r.ref.ep.WriteHeaderIncludedPacket(r, payload, r.loop); err != nil {
+ if err := r.ref.ep.WriteHeaderIncludedPacket(r, payload, r.Loop); err != nil {
r.Stats().IP.OutgoingPacketErrors.Increment()
return err
}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index f67975525..5ea432a24 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -351,10 +351,9 @@ type Stack struct {
networkProtocols map[tcpip.NetworkProtocolNumber]NetworkProtocol
linkAddrResolvers map[tcpip.NetworkProtocolNumber]LinkAddressResolver
- // unassociatedFactory creates unassociated endpoints. If nil, raw
- // endpoints are disabled. It is set during Stack creation and is
- // immutable.
- unassociatedFactory UnassociatedEndpointFactory
+ // rawFactory creates raw endpoints. If nil, raw endpoints are
+ // disabled. It is set during Stack creation and is immutable.
+ rawFactory RawFactory
demux *transportDemuxer
@@ -399,6 +398,18 @@ type Stack struct {
//
// TODO(gvisor.dev/issue/940): S/R this field.
portSeed uint32
+
+ // ndpConfigs is the NDP configurations used by interfaces.
+ ndpConfigs NDPConfigurations
+
+ // autoGenIPv6LinkLocal determines whether or not the stack will attempt
+ // to auto-generate an IPv6 link-local address for newly enabled NICs.
+ // See the AutoGenIPv6LinkLocal field of Options for more details.
+ autoGenIPv6LinkLocal bool
+
+ // ndpDisp is the NDP event dispatcher that is used to send the netstack
+ // integrator NDP related events.
+ ndpDisp NDPDispatcher
}
// Options contains optional Stack configuration.
@@ -422,9 +433,32 @@ type Options struct {
// stack (false).
HandleLocal bool
- // UnassociatedFactory produces unassociated endpoints raw endpoints.
- // Raw endpoints are enabled only if this is non-nil.
- UnassociatedFactory UnassociatedEndpointFactory
+ // NDPConfigs is the NDP configurations used by interfaces.
+ //
+ // By default, NDPConfigs will have a zero value for its
+ // DupAddrDetectTransmits field, implying that DAD will not be performed
+ // before assigning an address to a NIC.
+ NDPConfigs NDPConfigurations
+
+ // AutoGenIPv6LinkLocal determins whether or not the stack will attempt
+ // to auto-generate an IPv6 link-local address for newly enabled NICs.
+ // Note, setting this to true does not mean that a link-local address
+ // will be assigned right away, or at all. If Duplicate Address
+ // Detection is enabled, an address will only be assigned if it
+ // successfully resolves. If it fails, no further attempt will be made
+ // to auto-generate an IPv6 link-local address.
+ //
+ // The generated link-local address will follow RFC 4291 Appendix A
+ // guidelines.
+ AutoGenIPv6LinkLocal bool
+
+ // NDPDisp is the NDP event dispatcher that an integrator can provide to
+ // receive NDP related events.
+ NDPDisp NDPDispatcher
+
+ // RawFactory produces raw endpoints. Raw endpoints are enabled only if
+ // this is non-nil.
+ RawFactory RawFactory
}
// TransportEndpointInfo holds useful information about a transport endpoint
@@ -458,6 +492,9 @@ func (*TransportEndpointInfo) IsEndpointInfo() {}
// New allocates a new networking stack with only the requested networking and
// transport protocols configured with default options.
//
+// Note, NDPConfigurations will be fixed before being used by the Stack. That
+// is, if an invalid value was provided, it will be reset to the default value.
+//
// Protocol options can be changed by calling the
// SetNetworkProtocolOption/SetTransportProtocolOption methods provided by the
// stack. Please refer to individual protocol implementations as to what options
@@ -468,18 +505,24 @@ func New(opts Options) *Stack {
clock = &tcpip.StdClock{}
}
+ // Make sure opts.NDPConfigs contains valid values only.
+ opts.NDPConfigs.validate()
+
s := &Stack{
- transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState),
- networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol),
- linkAddrResolvers: make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver),
- nics: make(map[tcpip.NICID]*NIC),
- linkAddrCache: newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts),
- PortManager: ports.NewPortManager(),
- clock: clock,
- stats: opts.Stats.FillIn(),
- handleLocal: opts.HandleLocal,
- icmpRateLimiter: NewICMPRateLimiter(),
- portSeed: generateRandUint32(),
+ transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState),
+ networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol),
+ linkAddrResolvers: make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver),
+ nics: make(map[tcpip.NICID]*NIC),
+ linkAddrCache: newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts),
+ PortManager: ports.NewPortManager(),
+ clock: clock,
+ stats: opts.Stats.FillIn(),
+ handleLocal: opts.HandleLocal,
+ icmpRateLimiter: NewICMPRateLimiter(),
+ portSeed: generateRandUint32(),
+ ndpConfigs: opts.NDPConfigs,
+ autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
+ ndpDisp: opts.NDPDisp,
}
// Add specified network protocols.
@@ -497,8 +540,8 @@ func New(opts Options) *Stack {
}
}
- // Add the factory for unassociated endpoints, if present.
- s.unassociatedFactory = opts.UnassociatedFactory
+ // Add the factory for raw endpoints, if present.
+ s.rawFactory = opts.RawFactory
// Create the global transport demuxer.
s.demux = newTransportDemuxer(s)
@@ -633,12 +676,12 @@ func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcp
// protocol. Raw endpoints receive all traffic for a given protocol regardless
// of address.
func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) {
- if s.unassociatedFactory == nil {
+ if s.rawFactory == nil {
return nil, tcpip.ErrNotPermitted
}
if !associated {
- return s.unassociatedFactory.NewUnassociatedRawEndpoint(s, network, transport, waiterQueue)
+ return s.rawFactory.NewUnassociatedEndpoint(s, network, transport, waiterQueue)
}
t, ok := s.transportProtocols[transport]
@@ -649,6 +692,16 @@ func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network
return t.proto.NewRawEndpoint(s, network, waiterQueue)
}
+// NewPacketEndpoint creates a new packet endpoint listening for the given
+// netProto.
+func (s *Stack) NewPacketEndpoint(cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+ if s.rawFactory == nil {
+ return nil, tcpip.ErrNotPermitted
+ }
+
+ return s.rawFactory.NewPacketEndpoint(s, cooked, netProto, waiterQueue)
+}
+
// createNIC creates a NIC with the provided id and link-layer endpoint, and
// optionally enable it.
func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled, loopback bool) *tcpip.Error {
@@ -1118,6 +1171,109 @@ func (s *Stack) Resume() {
}
}
+// RegisterPacketEndpoint registers ep with the stack, causing it to receive
+// all traffic of the specified netProto on the given NIC. If nicID is 0, it
+// receives traffic from every NIC.
+func (s *Stack) RegisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) *tcpip.Error {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // If no NIC is specified, capture on all devices.
+ if nicID == 0 {
+ // Register with each NIC.
+ for _, nic := range s.nics {
+ if err := nic.registerPacketEndpoint(netProto, ep); err != nil {
+ s.unregisterPacketEndpointLocked(0, netProto, ep)
+ return err
+ }
+ }
+ return nil
+ }
+
+ // Capture on a specific device.
+ nic, ok := s.nics[nicID]
+ if !ok {
+ return tcpip.ErrUnknownNICID
+ }
+ if err := nic.registerPacketEndpoint(netProto, ep); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+// UnregisterPacketEndpoint unregisters ep for packets of the specified
+// netProto from the specified NIC. If nicID is 0, ep is unregistered from all
+// NICs.
+func (s *Stack) UnregisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ s.unregisterPacketEndpointLocked(nicID, netProto, ep)
+}
+
+func (s *Stack) unregisterPacketEndpointLocked(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) {
+ // If no NIC is specified, unregister on all devices.
+ if nicID == 0 {
+ // Unregister with each NIC.
+ for _, nic := range s.nics {
+ nic.unregisterPacketEndpoint(netProto, ep)
+ }
+ return
+ }
+
+ // Unregister in a single device.
+ nic, ok := s.nics[nicID]
+ if !ok {
+ return
+ }
+ nic.unregisterPacketEndpoint(netProto, ep)
+}
+
+// WritePacket writes data directly to the specified NIC. It adds an ethernet
+// header based on the arguments.
+func (s *Stack) WritePacket(nicid tcpip.NICID, dst tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) *tcpip.Error {
+ s.mu.Lock()
+ nic, ok := s.nics[nicid]
+ s.mu.Unlock()
+ if !ok {
+ return tcpip.ErrUnknownDevice
+ }
+
+ // Add our own fake ethernet header.
+ ethFields := header.EthernetFields{
+ SrcAddr: nic.linkEP.LinkAddress(),
+ DstAddr: dst,
+ Type: netProto,
+ }
+ fakeHeader := make(header.Ethernet, header.EthernetMinimumSize)
+ fakeHeader.Encode(&ethFields)
+ ethHeader := buffer.View(fakeHeader).ToVectorisedView()
+ ethHeader.Append(payload)
+
+ if err := nic.linkEP.WriteRawPacket(ethHeader); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+// WriteRawPacket writes data directly to the specified NIC without adding any
+// headers.
+func (s *Stack) WriteRawPacket(nicid tcpip.NICID, payload buffer.VectorisedView) *tcpip.Error {
+ s.mu.Lock()
+ nic, ok := s.nics[nicid]
+ s.mu.Unlock()
+ if !ok {
+ return tcpip.ErrUnknownDevice
+ }
+
+ if err := nic.linkEP.WriteRawPacket(payload); err != nil {
+ return err
+ }
+
+ return nil
+}
+
// NetworkProtocolInstance returns the protocol instance in the stack for the
// specified network protocol. This method is public for protocol implementers
// and tests to use.
@@ -1238,6 +1394,37 @@ func (s *Stack) AllowICMPMessage() bool {
return s.icmpRateLimiter.Allow()
}
+// IsAddrTentative returns true if addr is tentative on the NIC with ID id.
+//
+// Note that if addr is not associated with a NIC with id ID, then this
+// function will return false. It will only return true if the address is
+// associated with the NIC AND it is tentative.
+func (s *Stack) IsAddrTentative(id tcpip.NICID, addr tcpip.Address) (bool, *tcpip.Error) {
+ s.mu.RLock()
+ defer s.mu.RUnlock()
+
+ nic, ok := s.nics[id]
+ if !ok {
+ return false, tcpip.ErrUnknownNICID
+ }
+
+ return nic.isAddrTentative(addr), nil
+}
+
+// DupTentativeAddrDetected attempts to inform the NIC with ID id that a
+// tentative addr on it is a duplicate on a link.
+func (s *Stack) DupTentativeAddrDetected(id tcpip.NICID, addr tcpip.Address) *tcpip.Error {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ nic, ok := s.nics[id]
+ if !ok {
+ return tcpip.ErrUnknownNICID
+ }
+
+ return nic.dupTentativeAddrDetected(addr)
+}
+
// PortSeed returns a 32 bit value that can be used as a seed value for port
// picking.
//
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 10fd1065f..9dae853d0 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -24,11 +24,14 @@ import (
"sort"
"strings"
"testing"
+ "time"
+ "github.com/google/go-cmp/cmp"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
@@ -144,6 +147,11 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr bu
return f.ep.WritePacket(r, gso, hdr, payload, fakeNetNumber)
}
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
+ panic("not implemented")
+}
+
func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error {
return tcpip.ErrNotSupported
}
@@ -1864,3 +1872,297 @@ func TestNICForwarding(t *testing.T) {
t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want)
}
}
+
+// TestNICAutoGenAddr tests the auto-generation of IPv6 link-local addresses
+// (or lack there-of if disabled (default)). Note, DAD will be disabled in
+// these tests.
+func TestNICAutoGenAddr(t *testing.T) {
+ tests := []struct {
+ name string
+ autoGen bool
+ linkAddr tcpip.LinkAddress
+ shouldGen bool
+ }{
+ {
+ "Disabled",
+ false,
+ linkAddr1,
+ false,
+ },
+ {
+ "Enabled",
+ true,
+ linkAddr1,
+ true,
+ },
+ {
+ "Nil MAC",
+ true,
+ tcpip.LinkAddress([]byte(nil)),
+ false,
+ },
+ {
+ "Empty MAC",
+ true,
+ tcpip.LinkAddress(""),
+ false,
+ },
+ {
+ "Invalid MAC",
+ true,
+ tcpip.LinkAddress("\x01\x02\x03"),
+ false,
+ },
+ {
+ "Multicast MAC",
+ true,
+ tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+ false,
+ },
+ {
+ "Unspecified MAC",
+ true,
+ tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"),
+ false,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ opts := stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ }
+
+ if test.autoGen {
+ // Only set opts.AutoGenIPv6LinkLocal when
+ // test.autoGen is true because
+ // opts.AutoGenIPv6LinkLocal should be false by
+ // default.
+ opts.AutoGenIPv6LinkLocal = true
+ }
+
+ e := channel.New(10, 1280, test.linkAddr)
+ s := stack.New(opts)
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(_) = %s", err)
+ }
+
+ addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+ if err != nil {
+ t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+ }
+
+ if test.shouldGen {
+ // Should have auto-generated an address and
+ // resolved immediately (DAD is disabled).
+ if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddr(test.linkAddr), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
+ }
+ } else {
+ // Should not have auto-generated an address.
+ if want := (tcpip.AddressWithPrefix{}); addr != want {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+ }
+ }
+ })
+ }
+}
+
+// TestNICAutoGenAddrDoesDAD tests that the successful auto-generation of IPv6
+// link-local addresses will only be assigned after the DAD process resolves.
+func TestNICAutoGenAddrDoesDAD(t *testing.T) {
+ ndpDisp := ndpDispatcher{
+ dadC: make(chan ndpDADEvent),
+ }
+ ndpConfigs := stack.DefaultNDPConfigurations()
+ opts := stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: ndpConfigs,
+ AutoGenIPv6LinkLocal: true,
+ NDPDisp: &ndpDisp,
+ }
+
+ e := channel.New(10, 1280, linkAddr1)
+ s := stack.New(opts)
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(_) = %s", err)
+ }
+
+ // Address should not be considered bound to the
+ // NIC yet (DAD ongoing).
+ addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+ if err != nil {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (_, %v), want = (_, nil)", err)
+ }
+ if want := (tcpip.AddressWithPrefix{}); addr != want {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+ }
+
+ linkLocalAddr := header.LinkLocalAddr(linkAddr1)
+
+ // Wait for DAD to resolve.
+ select {
+ case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+ // We should get a resolution event after 1s (default time to
+ // resolve as per default NDP configurations). Waiting for that
+ // resolution time + an extra 1s without a resolution event
+ // means something is wrong.
+ t.Fatal("timed out waiting for DAD resolution")
+ case e := <-ndpDisp.dadC:
+ if e.err != nil {
+ t.Fatal("got DAD error: ", e.err)
+ }
+ if e.nicid != 1 {
+ t.Fatalf("got DAD event w/ nicid = %d, want = 1", e.nicid)
+ }
+ if e.addr != linkLocalAddr {
+ t.Fatalf("got DAD event w/ addr = %s, want = %s", addr, linkLocalAddr)
+ }
+ if !e.resolved {
+ t.Fatal("got DAD event w/ resolved = false, want = true")
+ }
+ }
+ addr, err = s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+ if err != nil {
+ t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+ }
+ if want := (tcpip.AddressWithPrefix{Address: linkLocalAddr, PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
+ }
+}
+
+// TestNewPEB tests that a new PrimaryEndpointBehavior value (peb) is respected
+// when an address's kind gets "promoted" to permanent from permanentExpired.
+func TestNewPEBOnPromotionToPermanent(t *testing.T) {
+ pebs := []stack.PrimaryEndpointBehavior{
+ stack.NeverPrimaryEndpoint,
+ stack.CanBePrimaryEndpoint,
+ stack.FirstPrimaryEndpoint,
+ }
+
+ for _, pi := range pebs {
+ for _, ps := range pebs {
+ t.Run(fmt.Sprintf("%d-to-%d", pi, ps), func(t *testing.T) {
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+ })
+ ep1 := channel.New(10, defaultMTU, "")
+ if err := s.CreateNIC(1, ep1); err != nil {
+ t.Fatal("CreateNIC failed:", err)
+ }
+
+ // Add a permanent address with initial
+ // PrimaryEndpointBehavior (peb), pi. If pi is
+ // NeverPrimaryEndpoint, the address should not
+ // be returned by a call to GetMainNICAddress;
+ // else, it should.
+ if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x01", pi); err != nil {
+ t.Fatal("AddAddressWithOptions failed:", err)
+ }
+ addr, err := s.GetMainNICAddress(1, fakeNetNumber)
+ if err != nil {
+ t.Fatal("s.GetMainNICAddress failed:", err)
+ }
+ if pi == stack.NeverPrimaryEndpoint {
+ if want := (tcpip.AddressWithPrefix{}); addr != want {
+ t.Fatalf("got GetMainNICAddress = %s, want = %s", addr, want)
+
+ }
+ } else if addr.Address != "\x01" {
+ t.Fatalf("got GetMainNICAddress = %s, want = 1", addr.Address)
+ }
+
+ {
+ subnet, err := tcpip.NewSubnet("\x00", "\x00")
+ if err != nil {
+ t.Fatalf("NewSubnet failed:", err)
+ }
+ s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+ }
+
+ // Take a route through the address so its ref
+ // count gets incremented and does not actually
+ // get deleted when RemoveAddress is called
+ // below. This is because we want to test that a
+ // new peb is respected when an address gets
+ // "promoted" to permanent from a
+ // permanentExpired kind.
+ r, err := s.FindRoute(1, "\x01", "\x02", fakeNetNumber, false)
+ if err != nil {
+ t.Fatal("FindRoute failed:", err)
+ }
+ defer r.Release()
+ if err := s.RemoveAddress(1, "\x01"); err != nil {
+ t.Fatalf("RemoveAddress failed:", err)
+ }
+
+ //
+ // At this point, the address should still be
+ // known by the NIC, but have its
+ // kind = permanentExpired.
+ //
+
+ // Add some other address with peb set to
+ // FirstPrimaryEndpoint.
+ if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x03", stack.FirstPrimaryEndpoint); err != nil {
+ t.Fatal("AddAddressWithOptions failed:", err)
+
+ }
+
+ // Add back the address we removed earlier and
+ // make sure the new peb was respected.
+ // (The address should just be promoted now).
+ if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x01", ps); err != nil {
+ t.Fatal("AddAddressWithOptions failed:", err)
+ }
+ var primaryAddrs []tcpip.Address
+ for _, pa := range s.NICInfo()[1].ProtocolAddresses {
+ primaryAddrs = append(primaryAddrs, pa.AddressWithPrefix.Address)
+ }
+ var expectedList []tcpip.Address
+ switch ps {
+ case stack.FirstPrimaryEndpoint:
+ expectedList = []tcpip.Address{
+ "\x01",
+ "\x03",
+ }
+ case stack.CanBePrimaryEndpoint:
+ expectedList = []tcpip.Address{
+ "\x03",
+ "\x01",
+ }
+ case stack.NeverPrimaryEndpoint:
+ expectedList = []tcpip.Address{
+ "\x03",
+ }
+ }
+ if !cmp.Equal(primaryAddrs, expectedList) {
+ t.Fatalf("got NIC's primary addresses = %v, want = %v", primaryAddrs, expectedList)
+ }
+
+ // Once we remove the other address, if the new
+ // peb, ps, was NeverPrimaryEndpoint, no address
+ // should be returned by a call to
+ // GetMainNICAddress; else, our original address
+ // should be returned.
+ if err := s.RemoveAddress(1, "\x03"); err != nil {
+ t.Fatalf("RemoveAddress failed:", err)
+ }
+ addr, err = s.GetMainNICAddress(1, fakeNetNumber)
+ if err != nil {
+ t.Fatal("s.GetMainNICAddress failed:", err)
+ }
+ if ps == stack.NeverPrimaryEndpoint {
+ if want := (tcpip.AddressWithPrefix{}); addr != want {
+ t.Fatalf("got GetMainNICAddress = %s, want = %s", addr, want)
+
+ }
+ } else {
+ if addr.Address != "\x01" {
+ t.Fatalf("got GetMainNICAddress = %s, want = 1", addr.Address)
+ }
+ }
+ })
+ }
+ }
+}
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index 92267ce4d..97a1aec4b 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -465,7 +465,7 @@ func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer
func (d *transportDemuxer) registerRawEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error {
eps, ok := d.protocol[protocolIDs{netProto, transProto}]
if !ok {
- return nil
+ return tcpip.ErrNotSupported
}
eps.mu.Lock()
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 26f338d8d..353ecd49b 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -255,7 +255,7 @@ type FullAddress struct {
// This may not be used by all endpoint types.
NIC NICID
- // Addr is the network address.
+ // Addr is the network or link layer address.
Addr Address
// Port is the transport port.
@@ -1100,15 +1100,12 @@ func (*TransportEndpointStats) IsEndpointStats() {}
func fillIn(v reflect.Value) {
for i := 0; i < v.NumField(); i++ {
v := v.Field(i)
- switch v.Kind() {
- case reflect.Ptr:
- if s := v.Addr().Interface().(**StatCounter); *s == nil {
- *s = &StatCounter{}
+ if s, ok := v.Addr().Interface().(**StatCounter); ok {
+ if *s == nil {
+ *s = new(StatCounter)
}
- case reflect.Struct:
+ } else {
fillIn(v)
- default:
- panic(fmt.Sprintf("unexpected type %s", v.Type()))
}
}
}
diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD
new file mode 100644
index 000000000..8ea2e6ee5
--- /dev/null
+++ b/pkg/tcpip/transport/packet/BUILD
@@ -0,0 +1,46 @@
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+ name = "packet_list",
+ out = "packet_list.go",
+ package = "packet",
+ prefix = "packet",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*packet",
+ "Linker": "*packet",
+ },
+)
+
+go_library(
+ name = "packet",
+ srcs = [
+ "endpoint.go",
+ "endpoint_state.go",
+ "packet_list.go",
+ ],
+ importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/packet",
+ imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/log",
+ "//pkg/sleep",
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/header",
+ "//pkg/tcpip/iptables",
+ "//pkg/tcpip/stack",
+ "//pkg/waiter",
+ ],
+)
+
+filegroup(
+ name = "autogen",
+ srcs = [
+ "packet_list.go",
+ ],
+ visibility = ["//:sandbox"],
+)
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
new file mode 100644
index 000000000..73cdaa265
--- /dev/null
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -0,0 +1,363 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package packet provides the implementation of packet sockets (see
+// packet(7)). Packet sockets allow applications to:
+//
+// * manually write and inspect link, network, and transport headers
+// * receive all traffic of a given network protocol, or all protocols
+//
+// Packet sockets are similar to raw sockets, but provide even more power to
+// users, letting them effectively talk directly to the network device.
+//
+// Packet sockets skip the input and output iptables chains.
+package packet
+
+import (
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/iptables"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// +stateify savable
+type packet struct {
+ packetEntry
+ // data holds the actual packet data, including any headers and
+ // payload.
+ data buffer.VectorisedView `state:".(buffer.VectorisedView)"`
+ // views is pre-allocated space to back data. As long as the packet is
+ // made up of fewer than 8 buffer.Views, no extra allocation is
+ // necessary to store packet data.
+ views [8]buffer.View `state:"nosave"`
+ // timestampNS is the unix time at which the packet was received.
+ timestampNS int64
+ // senderAddr is the network address of the sender.
+ senderAddr tcpip.FullAddress
+}
+
+// endpoint is the packet socket implementation of tcpip.Endpoint. It is legal
+// to have goroutines make concurrent calls into the endpoint.
+//
+// Lock order:
+// endpoint.mu
+// endpoint.rcvMu
+//
+// +stateify savable
+type endpoint struct {
+ stack.TransportEndpointInfo
+ // The following fields are initialized at creation time and are
+ // immutable.
+ stack *stack.Stack `state:"manual"`
+ netProto tcpip.NetworkProtocolNumber
+ waiterQueue *waiter.Queue
+ cooked bool
+
+ // The following fields are used to manage the receive queue and are
+ // protected by rcvMu.
+ rcvMu sync.Mutex `state:"nosave"`
+ rcvList packetList
+ rcvBufSizeMax int `state:".(int)"`
+ rcvBufSize int
+ rcvClosed bool
+
+ // The following fields are protected by mu.
+ mu sync.RWMutex `state:"nosave"`
+ sndBufSize int
+ closed bool
+ stats tcpip.TransportEndpointStats `state:"nosave"`
+}
+
+// NewEndpoint returns a new packet endpoint.
+func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+ ep := &endpoint{
+ stack: s,
+ TransportEndpointInfo: stack.TransportEndpointInfo{
+ NetProto: netProto,
+ },
+ cooked: cooked,
+ netProto: netProto,
+ waiterQueue: waiterQueue,
+ rcvBufSizeMax: 32 * 1024,
+ sndBufSize: 32 * 1024,
+ }
+
+ if err := s.RegisterPacketEndpoint(0, netProto, ep); err != nil {
+ return nil, err
+ }
+ return ep, nil
+}
+
+// Close implements tcpip.Endpoint.Close.
+func (ep *endpoint) Close() {
+ ep.mu.Lock()
+ defer ep.mu.Unlock()
+
+ if ep.closed {
+ return
+ }
+
+ ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep)
+
+ ep.rcvMu.Lock()
+ defer ep.rcvMu.Unlock()
+
+ // Clear the receive list.
+ ep.rcvClosed = true
+ ep.rcvBufSize = 0
+ for !ep.rcvList.Empty() {
+ ep.rcvList.Remove(ep.rcvList.Front())
+ }
+
+ ep.closed = true
+ ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+}
+
+// ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
+func (ep *endpoint) ModerateRecvBuf(copied int) {}
+
+// IPTables implements tcpip.Endpoint.IPTables.
+func (ep *endpoint) IPTables() (iptables.IPTables, error) {
+ return ep.stack.IPTables(), nil
+}
+
+// Read implements tcpip.Endpoint.Read.
+func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
+ ep.rcvMu.Lock()
+
+ // If there's no data to read, return that read would block or that the
+ // endpoint is closed.
+ if ep.rcvList.Empty() {
+ err := tcpip.ErrWouldBlock
+ if ep.rcvClosed {
+ ep.stats.ReadErrors.ReadClosed.Increment()
+ err = tcpip.ErrClosedForReceive
+ }
+ ep.rcvMu.Unlock()
+ return buffer.View{}, tcpip.ControlMessages{}, err
+ }
+
+ packet := ep.rcvList.Front()
+ ep.rcvList.Remove(packet)
+ ep.rcvBufSize -= packet.data.Size()
+
+ ep.rcvMu.Unlock()
+
+ if addr != nil {
+ *addr = packet.senderAddr
+ }
+
+ return packet.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: packet.timestampNS}, nil
+}
+
+func (ep *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+ // TODO(b/129292371): Implement.
+ return 0, nil, tcpip.ErrInvalidOptionValue
+}
+
+// Peek implements tcpip.Endpoint.Peek.
+func (ep *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
+ return 0, tcpip.ControlMessages{}, nil
+}
+
+// Disconnect implements tcpip.Endpoint.Disconnect. Packet sockets cannot be
+// disconnected, and this function always returns tpcip.ErrNotSupported.
+func (*endpoint) Disconnect() *tcpip.Error {
+ return tcpip.ErrNotSupported
+}
+
+// Connect implements tcpip.Endpoint.Connect. Packet sockets cannot be
+// connected, and this function always returnes tcpip.ErrNotSupported.
+func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+ return tcpip.ErrNotSupported
+}
+
+// Shutdown implements tcpip.Endpoint.Shutdown. Packet sockets cannot be used
+// with Shutdown, and this function always returns tcpip.ErrNotSupported.
+func (ep *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
+ return tcpip.ErrNotSupported
+}
+
+// Listen implements tcpip.Endpoint.Listen. Packet sockets cannot be used with
+// Listen, and this function always returns tcpip.ErrNotSupported.
+func (ep *endpoint) Listen(backlog int) *tcpip.Error {
+ return tcpip.ErrNotSupported
+}
+
+// Accept implements tcpip.Endpoint.Accept. Packet sockets cannot be used with
+// Accept, and this function always returns tcpip.ErrNotSupported.
+func (ep *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+ return nil, nil, tcpip.ErrNotSupported
+}
+
+// Bind implements tcpip.Endpoint.Bind.
+func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
+ // TODO(gvisor.dev/issue/173): Add Bind support.
+
+ // "By default, all packets of the specified protocol type are passed
+ // to a packet socket. To get packets only from a specific interface
+ // use bind(2) specifying an address in a struct sockaddr_ll to bind
+ // the packet socket to an interface. Fields used for binding are
+ // sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex."
+ // - packet(7).
+
+ return tcpip.ErrNotSupported
+}
+
+// GetLocalAddress implements tcpip.Endpoint.GetLocalAddress.
+func (ep *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+ return tcpip.FullAddress{}, tcpip.ErrNotSupported
+}
+
+// GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress.
+func (ep *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+ // Even a connected socket doesn't return a remote address.
+ return tcpip.FullAddress{}, tcpip.ErrNotConnected
+}
+
+// Readiness implements tcpip.Endpoint.Readiness.
+func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
+ // The endpoint is always writable.
+ result := waiter.EventOut & mask
+
+ // Determine whether the endpoint is readable.
+ if (mask & waiter.EventIn) != 0 {
+ ep.rcvMu.Lock()
+ if !ep.rcvList.Empty() || ep.rcvClosed {
+ result |= waiter.EventIn
+ }
+ ep.rcvMu.Unlock()
+ }
+
+ return result
+}
+
+// SetSockOpt implements tcpip.Endpoint.SetSockOpt. Packet sockets cannot be
+// used with SetSockOpt, and this function always returns
+// tcpip.ErrNotSupported.
+func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+ return tcpip.ErrNotSupported
+}
+
+// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
+func (ep *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+ return tcpip.ErrUnknownProtocolOption
+}
+
+// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
+func (ep *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+ return 0, tcpip.ErrNotSupported
+}
+
+// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
+func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+ return tcpip.ErrNotSupported
+}
+
+// HandlePacket implements stack.PacketEndpoint.HandlePacket.
+func (ep *endpoint) HandlePacket(nicid tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, vv buffer.VectorisedView, ethHeader buffer.View) {
+ ep.rcvMu.Lock()
+
+ // Drop the packet if our buffer is currently full.
+ if ep.rcvClosed {
+ ep.rcvMu.Unlock()
+ ep.stack.Stats().DroppedPackets.Increment()
+ ep.stats.ReceiveErrors.ClosedReceiver.Increment()
+ return
+ }
+
+ if ep.rcvBufSize >= ep.rcvBufSizeMax {
+ ep.rcvMu.Unlock()
+ ep.stack.Stats().DroppedPackets.Increment()
+ ep.stats.ReceiveErrors.ReceiveBufferOverflow.Increment()
+ return
+ }
+
+ wasEmpty := ep.rcvBufSize == 0
+
+ // Push new packet into receive list and increment the buffer size.
+ var packet packet
+ // TODO(b/129292371): Return network protocol.
+ if len(ethHeader) > 0 {
+ // Get info directly from the ethernet header.
+ hdr := header.Ethernet(ethHeader)
+ packet.senderAddr = tcpip.FullAddress{
+ NIC: nicid,
+ Addr: tcpip.Address(hdr.SourceAddress()),
+ }
+ } else {
+ // Guess the would-be ethernet header.
+ packet.senderAddr = tcpip.FullAddress{
+ NIC: nicid,
+ Addr: tcpip.Address(localAddr),
+ }
+ }
+
+ if ep.cooked {
+ // Cooked packets can simply be queued.
+ packet.data = vv.Clone(packet.views[:])
+ } else {
+ // Raw packets need their ethernet headers prepended before
+ // queueing.
+ if len(ethHeader) == 0 {
+ // We weren't provided with an actual ethernet header,
+ // so fake one.
+ ethFields := header.EthernetFields{
+ SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}),
+ DstAddr: localAddr,
+ Type: netProto,
+ }
+ fakeHeader := make(header.Ethernet, header.EthernetMinimumSize)
+ fakeHeader.Encode(&ethFields)
+ ethHeader = buffer.View(fakeHeader)
+ }
+ combinedVV := buffer.View(ethHeader).ToVectorisedView()
+ combinedVV.Append(vv)
+ packet.data = combinedVV.Clone(packet.views[:])
+ }
+ packet.timestampNS = ep.stack.NowNanoseconds()
+
+ ep.rcvList.PushBack(&packet)
+ ep.rcvBufSize += packet.data.Size()
+
+ ep.rcvMu.Unlock()
+ ep.stats.PacketsReceived.Increment()
+ // Notify waiters that there's data to be read.
+ if wasEmpty {
+ ep.waiterQueue.Notify(waiter.EventIn)
+ }
+}
+
+// State implements socket.Socket.State.
+func (ep *endpoint) State() uint32 {
+ return 0
+}
+
+// Info returns a copy of the endpoint info.
+func (ep *endpoint) Info() tcpip.EndpointInfo {
+ ep.mu.RLock()
+ // Make a copy of the endpoint info.
+ ret := ep.TransportEndpointInfo
+ ep.mu.RUnlock()
+ return &ret
+}
+
+// Stats returns a pointer to the endpoint stats.
+func (ep *endpoint) Stats() tcpip.EndpointStats {
+ return &ep.stats
+}
diff --git a/pkg/tcpip/transport/packet/endpoint_state.go b/pkg/tcpip/transport/packet/endpoint_state.go
new file mode 100644
index 000000000..9b88f17e4
--- /dev/null
+++ b/pkg/tcpip/transport/packet/endpoint_state.go
@@ -0,0 +1,72 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package packet
+
+import (
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// saveData saves packet.data field.
+func (p *packet) saveData() buffer.VectorisedView {
+ // We cannot save p.data directly as p.data.views may alias to p.views,
+ // which is not allowed by state framework (in-struct pointer).
+ return p.data.Clone(nil)
+}
+
+// loadData loads packet.data field.
+func (p *packet) loadData(data buffer.VectorisedView) {
+ // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization
+ // here because data.views is not guaranteed to be loaded by now. Plus,
+ // data.views will be allocated anyway so there really is little point
+ // of utilizing p.views for data.views.
+ p.data = data
+}
+
+// beforeSave is invoked by stateify.
+func (ep *endpoint) beforeSave() {
+ // Stop incoming packets from being handled (and mutate endpoint state).
+ // The lock will be released after saveRcvBufSizeMax(), which would have
+ // saved ep.rcvBufSizeMax and set it to 0 to continue blocking incoming
+ // packets.
+ ep.rcvMu.Lock()
+}
+
+// saveRcvBufSizeMax is invoked by stateify.
+func (ep *endpoint) saveRcvBufSizeMax() int {
+ max := ep.rcvBufSizeMax
+ // Make sure no new packets will be handled regardless of the lock.
+ ep.rcvBufSizeMax = 0
+ // Release the lock acquired in beforeSave() so regular endpoint closing
+ // logic can proceed after save.
+ ep.rcvMu.Unlock()
+ return max
+}
+
+// loadRcvBufSizeMax is invoked by stateify.
+func (ep *endpoint) loadRcvBufSizeMax(max int) {
+ ep.rcvBufSizeMax = max
+}
+
+// afterLoad is invoked by stateify.
+func (ep *endpoint) afterLoad() {
+ // StackFromEnv is a stack used specifically for save/restore.
+ ep.stack = stack.StackFromEnv
+
+ // TODO(gvisor.dev/173): Once bind is supported, choose the right NIC.
+ if err := ep.stack.RegisterPacketEndpoint(0, ep.netProto, ep); err != nil {
+ panic(*err)
+ }
+}
diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD
index fba598d51..4af49218c 100644
--- a/pkg/tcpip/transport/raw/BUILD
+++ b/pkg/tcpip/transport/raw/BUILD
@@ -4,14 +4,14 @@ load("//tools/go_stateify:defs.bzl", "go_library")
package(licenses = ["notice"])
go_template_instance(
- name = "packet_list",
- out = "packet_list.go",
+ name = "raw_packet_list",
+ out = "raw_packet_list.go",
package = "raw",
- prefix = "packet",
+ prefix = "rawPacket",
template = "//pkg/ilist:generic_list",
types = {
- "Element": "*packet",
- "Linker": "*packet",
+ "Element": "*rawPacket",
+ "Linker": "*rawPacket",
},
)
@@ -20,8 +20,8 @@ go_library(
srcs = [
"endpoint.go",
"endpoint_state.go",
- "packet_list.go",
"protocol.go",
+ "raw_packet_list.go",
],
importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/raw",
imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
@@ -34,6 +34,7 @@ go_library(
"//pkg/tcpip/header",
"//pkg/tcpip/iptables",
"//pkg/tcpip/stack",
+ "//pkg/tcpip/transport/packet",
"//pkg/waiter",
],
)
@@ -41,7 +42,7 @@ go_library(
filegroup(
name = "autogen",
srcs = [
- "packet_list.go",
+ "raw_packet_list.go",
],
visibility = ["//:sandbox"],
)
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index b4c660859..308f10d24 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -17,8 +17,7 @@
//
// * manually write and inspect transport layer headers and payloads
// * receive all traffic of a given transport protocol (e.g. ICMP or UDP)
-// * optionally write and inspect network layer and link layer headers for
-// packets
+// * optionally write and inspect network layer headers of packets
//
// Raw sockets don't have any notion of ports, and incoming packets are
// demultiplexed solely by protocol number. Thus, a raw UDP endpoint will
@@ -38,8 +37,8 @@ import (
)
// +stateify savable
-type packet struct {
- packetEntry
+type rawPacket struct {
+ rawPacketEntry
// data holds the actual packet data, including any headers and
// payload.
data buffer.VectorisedView `state:".(buffer.VectorisedView)"`
@@ -72,7 +71,7 @@ type endpoint struct {
// The following fields are used to manage the receive queue and are
// protected by rcvMu.
rcvMu sync.Mutex `state:"nosave"`
- rcvList packetList
+ rcvList rawPacketList
rcvBufSizeMax int `state:".(int)"`
rcvBufSize int
rcvClosed bool
@@ -90,7 +89,6 @@ type endpoint struct {
}
// NewEndpoint returns a raw endpoint for the given protocols.
-// TODO(b/129292371): IP_HDRINCL and AF_PACKET.
func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
return newEndpoint(stack, netProto, transProto, waiterQueue, true /* associated */)
}
@@ -187,17 +185,17 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
return buffer.View{}, tcpip.ControlMessages{}, err
}
- packet := e.rcvList.Front()
- e.rcvList.Remove(packet)
- e.rcvBufSize -= packet.data.Size()
+ pkt := e.rcvList.Front()
+ e.rcvList.Remove(pkt)
+ e.rcvBufSize -= pkt.data.Size()
e.rcvMu.Unlock()
if addr != nil {
- *addr = packet.senderAddr
+ *addr = pkt.senderAddr
}
- return packet.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: packet.timestampNS}, nil
+ return pkt.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: pkt.timestampNS}, nil
}
// Write implements tcpip.Endpoint.Write.
@@ -602,7 +600,7 @@ func (e *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv bu
wasEmpty := e.rcvBufSize == 0
// Push new packet into receive list and increment the buffer size.
- packet := &packet{
+ pkt := &rawPacket{
senderAddr: tcpip.FullAddress{
NIC: route.NICID(),
Addr: route.RemoteAddress,
@@ -611,11 +609,11 @@ func (e *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv bu
combinedVV := netHeader.ToVectorisedView()
combinedVV.Append(vv)
- packet.data = combinedVV.Clone(packet.views[:])
- packet.timestampNS = e.stack.NowNanoseconds()
+ pkt.data = combinedVV.Clone(pkt.views[:])
+ pkt.timestampNS = e.stack.NowNanoseconds()
- e.rcvList.PushBack(packet)
- e.rcvBufSize += packet.data.Size()
+ e.rcvList.PushBack(pkt)
+ e.rcvBufSize += pkt.data.Size()
e.rcvMu.Unlock()
e.stats.PacketsReceived.Increment()
diff --git a/pkg/tcpip/transport/raw/endpoint_state.go b/pkg/tcpip/transport/raw/endpoint_state.go
index a6c7cc43a..33bfb56cd 100644
--- a/pkg/tcpip/transport/raw/endpoint_state.go
+++ b/pkg/tcpip/transport/raw/endpoint_state.go
@@ -20,15 +20,15 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
-// saveData saves packet.data field.
-func (p *packet) saveData() buffer.VectorisedView {
+// saveData saves rawPacket.data field.
+func (p *rawPacket) saveData() buffer.VectorisedView {
// We cannot save p.data directly as p.data.views may alias to p.views,
// which is not allowed by state framework (in-struct pointer).
return p.data.Clone(nil)
}
-// loadData loads packet.data field.
-func (p *packet) loadData(data buffer.VectorisedView) {
+// loadData loads rawPacket.data field.
+func (p *rawPacket) loadData(data buffer.VectorisedView) {
// NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization
// here because data.views is not guaranteed to be loaded by now. Plus,
// data.views will be allocated anyway so there really is little point
@@ -86,7 +86,9 @@ func (ep *endpoint) Resume(s *stack.Stack) {
}
}
- if err := ep.stack.RegisterRawTransportEndpoint(ep.RegisterNICID, ep.NetProto, ep.TransProto, ep); err != nil {
- panic(err)
+ if ep.associated {
+ if err := ep.stack.RegisterRawTransportEndpoint(ep.RegisterNICID, ep.NetProto, ep.TransProto, ep); err != nil {
+ panic(err)
+ }
}
}
diff --git a/pkg/tcpip/transport/raw/protocol.go b/pkg/tcpip/transport/raw/protocol.go
index a2512d666..f30aa2a4a 100644
--- a/pkg/tcpip/transport/raw/protocol.go
+++ b/pkg/tcpip/transport/raw/protocol.go
@@ -17,13 +17,19 @@ package raw
import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/tcpip/transport/packet"
"gvisor.dev/gvisor/pkg/waiter"
)
-// EndpointFactory implements stack.UnassociatedEndpointFactory.
+// EndpointFactory implements stack.RawFactory.
type EndpointFactory struct{}
-// NewUnassociatedRawEndpoint implements stack.UnassociatedEndpointFactory.
-func (EndpointFactory) NewUnassociatedRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+// NewUnassociatedEndpoint implements stack.RawFactory.NewUnassociatedEndpoint.
+func (EndpointFactory) NewUnassociatedEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
return newEndpoint(stack, netProto, transProto, waiterQueue, false /* associated */)
}
+
+// NewPacketEndpoint implements stack.RawFactory.NewPacketEndpoint.
+func (EndpointFactory) NewPacketEndpoint(stack *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+ return packet.NewEndpoint(stack, cooked, netProto, waiterQueue)
+}
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index aed70e06f..f1dbc6f91 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -44,6 +44,7 @@ go_library(
imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
visibility = ["//visibility:public"],
deps = [
+ "//pkg/log",
"//pkg/rand",
"//pkg/sleep",
"//pkg/tcpip",
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index b724d02bb..8db1cc028 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -607,17 +607,11 @@ func (e *endpoint) sendTCP(r *stack.Route, id stack.TransportEndpointID, data bu
return nil
}
-// sendTCP sends a TCP segment with the provided options via the provided
-// network endpoint and under the provided identity.
-func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
+func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, d *stack.PacketDescriptor, data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) {
optLen := len(opts)
- // Allocate a buffer for the TCP header.
- hdr := buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen)
-
- if rcvWnd > 0xffff {
- rcvWnd = 0xffff
- }
-
+ hdr := &d.Hdr
+ packetSize := d.Size
+ off := d.Off
// Initialize the header.
tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen))
tcp.Encode(&header.TCPFields{
@@ -631,7 +625,7 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise
})
copy(tcp[header.TCPMinimumSize:], opts)
- length := uint16(hdr.UsedLength() + data.Size())
+ length := uint16(hdr.UsedLength() + packetSize)
xsum := r.PseudoHeaderChecksum(ProtocolNumber, length)
// Only calculate the checksum if offloading isn't supported.
if gso != nil && gso.NeedsCsum {
@@ -641,14 +635,71 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise
// header and data and get the right sum of the TCP packet.
tcp.SetChecksum(xsum)
} else if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 {
- xsum = header.ChecksumVV(data, xsum)
+ xsum = header.ChecksumVVWithOffset(data, xsum, off, packetSize)
tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
}
+}
+
+func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
+ optLen := len(opts)
+ if rcvWnd > 0xffff {
+ rcvWnd = 0xffff
+ }
+
+ mss := int(gso.MSS)
+ n := (data.Size() + mss - 1) / mss
+
+ hdrs := stack.NewPacketDescriptors(n, header.TCPMinimumSize+int(r.MaxHeaderLength())+optLen)
+
+ size := data.Size()
+ off := 0
+ for i := 0; i < n; i++ {
+ packetSize := mss
+ if packetSize > size {
+ packetSize = size
+ }
+ size -= packetSize
+ hdrs[i].Off = off
+ hdrs[i].Size = packetSize
+ buildTCPHdr(r, id, &hdrs[i], data, flags, seq, ack, rcvWnd, opts, gso)
+ off += packetSize
+ seq = seq.Add(seqnum.Size(packetSize))
+ }
+ if ttl == 0 {
+ ttl = r.DefaultTTL()
+ }
+ sent, err := r.WritePackets(gso, hdrs, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos})
+ if err != nil {
+ r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent))
+ }
+ r.Stats().TCP.SegmentsSent.IncrementBy(uint64(sent))
+ return err
+}
+
+// sendTCP sends a TCP segment with the provided options via the provided
+// network endpoint and under the provided identity.
+func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
+ optLen := len(opts)
+ if rcvWnd > 0xffff {
+ rcvWnd = 0xffff
+ }
+
+ if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() {
+ return sendTCPBatch(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso)
+ }
+
+ d := &stack.PacketDescriptor{
+ Hdr: buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen),
+ Off: 0,
+ Size: data.Size(),
+ }
+ buildTCPHdr(r, id, d, data, flags, seq, ack, rcvWnd, opts, gso)
+
if ttl == 0 {
ttl = r.DefaultTTL()
}
- if err := r.WritePacket(gso, hdr, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}); err != nil {
+ if err := r.WritePacket(gso, d.Hdr, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}); err != nil {
r.Stats().TCP.SegmentSendErrors.Increment()
return err
}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 31a22c1eb..c6bc5528c 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -2328,11 +2328,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
return s
}
-func (e *endpoint) initGSO() {
- if e.route.Capabilities()&stack.CapabilityGSO == 0 {
- return
- }
-
+func (e *endpoint) initHardwareGSO() {
gso := &stack.GSO{}
switch e.route.NetProto {
case header.IPv4ProtocolNumber:
@@ -2350,6 +2346,18 @@ func (e *endpoint) initGSO() {
e.gso = gso
}
+func (e *endpoint) initGSO() {
+ if e.route.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+ e.initHardwareGSO()
+ } else if e.route.Capabilities()&stack.CapabilitySoftwareGSO != 0 {
+ e.gso = &stack.GSO{
+ MaxSize: e.route.GSOMaxSize(),
+ Type: stack.GSOSW,
+ NeedsCsum: false,
+ }
+ }
+}
+
// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *endpoint) State() uint32 {
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 6e87245b7..91c8487f3 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -140,7 +140,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
stack: s,
TransportEndpointInfo: stack.TransportEndpointInfo{
NetProto: netProto,
- TransProto: header.TCPProtocolNumber,
+ TransProto: header.UDPProtocolNumber,
},
waiterQueue: waiterQueue,
// RFC 1075 section 5.4 recommends a TTL of 1 for membership
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index de026880f..5c3358a5e 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -121,8 +121,15 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
payloadLen = available
}
- payload := buffer.NewVectorisedView(len(netHeader), []buffer.View{netHeader})
- payload.Append(vv)
+ // The buffers used by vv and netHeader may be used elsewhere
+ // in the system. For example, a raw or packet socket may use
+ // what UDP considers an unreachable destination. Thus we deep
+ // copy vv and netHeader to prevent multiple ownership and SR
+ // errors.
+ newNetHeader := make(buffer.View, len(netHeader))
+ copy(newNetHeader, netHeader)
+ payload := buffer.NewVectorisedView(len(newNetHeader), []buffer.View{newNetHeader})
+ payload.Append(vv.ToView().ToVectorisedView())
payload.CapLength(payloadLen)
hdr := buffer.NewPrependable(headerLen)