summaryrefslogtreecommitdiffhomepage
path: root/pkg/tcpip
diff options
context:
space:
mode:
authorBhasker Hariharan <bhaskerh@google.com>2019-05-21 15:23:12 -0700
committerShentubot <shentubot@google.com>2019-05-21 15:24:25 -0700
commit2ac0aeeb42ef517743a18224f9f6945c1c77d732 (patch)
tree3cc0942ae3948485013bc30dbb1404305b501209 /pkg/tcpip
parent9cdae51feca5cee9faa198161b92a0aeece52d6c (diff)
Refactor fdbased endpoint dispatcher code.
This is in preparation to support an fdbased endpoint that can read/dispatch packets from multiple underlying fds. Updates #231 PiperOrigin-RevId: 249337074 Change-Id: Id7d375186cffcf55ae5e38986e7d605a96916d35
Diffstat (limited to 'pkg/tcpip')
-rw-r--r--pkg/tcpip/link/fdbased/BUILD2
-rw-r--r--pkg/tcpip/link/fdbased/endpoint.go239
-rw-r--r--pkg/tcpip/link/fdbased/endpoint_test.go78
-rw-r--r--pkg/tcpip/link/fdbased/mmap.go12
-rw-r--r--pkg/tcpip/link/fdbased/mmap_amd64.go194
-rw-r--r--pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go174
-rw-r--r--pkg/tcpip/link/fdbased/packet_dispatchers.go309
7 files changed, 598 insertions, 410 deletions
diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD
index 50ce91a4e..cef98c353 100644
--- a/pkg/tcpip/link/fdbased/BUILD
+++ b/pkg/tcpip/link/fdbased/BUILD
@@ -8,7 +8,9 @@ go_library(
"endpoint.go",
"endpoint_unsafe.go",
"mmap.go",
+ "mmap_amd64.go",
"mmap_amd64_unsafe.go",
+ "packet_dispatchers.go",
],
importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased",
visibility = [
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index 4da376774..1f889c2a0 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -34,18 +34,11 @@ import (
"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
)
-const (
- // MaxMsgsPerRecv is the maximum number of packets we want to retrieve
- // in a single RecvMMsg call.
- MaxMsgsPerRecv = 8
-)
-
-// BufConfig defines the shape of the vectorised view used to read packets from the NIC.
-var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}
-
// linkDispatcher reads packets from the link FD and dispatches them to the
// NetworkDispatcher.
-type linkDispatcher func() (bool, *tcpip.Error)
+type linkDispatcher interface {
+ dispatch() (bool, *tcpip.Error)
+}
// PacketDispatchMode are the various supported methods of receiving and
// dispatching packets from the underlying FD.
@@ -92,11 +85,6 @@ type endpoint struct {
// its end of the communication pipe.
closed func(*tcpip.Error)
- views [][]buffer.View
- iovecs [][]syscall.Iovec
- // msgHdrs is only used by the RecvMMsg dispatcher.
- msgHdrs []rawfile.MMsgHdr
-
inboundDispatcher linkDispatcher
dispatcher stack.NetworkDispatcher
@@ -104,14 +92,6 @@ type endpoint struct {
// endpoint.
packetDispatchMode PacketDispatchMode
- // ringBuffer is only used when PacketMMap dispatcher is used and points
- // to the start of the mmapped PACKET_RX_RING buffer.
- ringBuffer []byte
-
- // ringOffset is the current offset into the ring buffer where the next
- // inbound packet will be placed by the kernel.
- ringOffset int
-
// gsoMaxSize is the maximum GSO packet size. It is zero if GSO is
// disabled.
gsoMaxSize uint32
@@ -174,11 +154,7 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) {
packetDispatchMode: opts.PacketDispatchMode,
}
- // For non-socket FDs we read one packet a time (e.g. TAP devices).
- msgsPerRecv := 1
- e.inboundDispatcher = e.dispatch
-
- isSocket, err := isSocketFD(opts.FD)
+ isSocket, err := isSocketFD(e.fd)
if err != nil {
return 0, err
}
@@ -187,44 +163,41 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) {
e.caps |= stack.CapabilityGSO
e.gsoMaxSize = opts.GSOMaxSize
}
+ }
+ e.inboundDispatcher, err = createInboundDispatcher(e, isSocket)
+ if err != nil {
+ return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err)
+ }
+
+ return stack.RegisterLinkEndpoint(e), nil
+}
+func createInboundDispatcher(e *endpoint, isSocket bool) (linkDispatcher, error) {
+ // By default use the readv() dispatcher as it works with all kinds of
+ // FDs (tap/tun/unix domain sockets and af_packet).
+ inboundDispatcher, err := newReadVDispatcher(e.fd, e)
+ if err != nil {
+ return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", e.fd, e, err)
+ }
+
+ if isSocket {
switch e.packetDispatchMode {
case PacketMMap:
- if err := e.setupPacketRXRing(); err != nil {
- return 0, fmt.Errorf("e.setupPacketRXRing failed: %v", err)
+ inboundDispatcher, err = newPacketMMapDispatcher(e.fd, e)
+ if err != nil {
+ return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", e.fd, e, err)
}
- e.inboundDispatcher = e.packetMMapDispatch
- return stack.RegisterLinkEndpoint(e), nil
-
case RecvMMsg:
// If the provided FD is a socket then we optimize
// packet reads by using recvmmsg() instead of read() to
// read packets in a batch.
- e.inboundDispatcher = e.recvMMsgDispatch
- msgsPerRecv = MaxMsgsPerRecv
+ inboundDispatcher, err = newRecvMMsgDispatcher(e.fd, e)
+ if err != nil {
+ return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", e.fd, e, err)
+ }
}
}
-
- e.views = make([][]buffer.View, msgsPerRecv)
- for i := range e.views {
- e.views[i] = make([]buffer.View, len(BufConfig))
- }
- e.iovecs = make([][]syscall.Iovec, msgsPerRecv)
- iovLen := len(BufConfig)
- if e.Capabilities()&stack.CapabilityGSO != 0 {
- // virtioNetHdr is prepended before each packet.
- iovLen++
- }
- for i := range e.iovecs {
- e.iovecs[i] = make([]syscall.Iovec, iovLen)
- }
- e.msgHdrs = make([]rawfile.MMsgHdr, msgsPerRecv)
- for i := range e.msgHdrs {
- e.msgHdrs[i].Msg.Iov = &e.iovecs[i][0]
- e.msgHdrs[i].Msg.Iovlen = uint64(iovLen)
- }
-
- return stack.RegisterLinkEndpoint(e), nil
+ return inboundDispatcher, nil
}
func isSocketFD(fd int) (bool, error) {
@@ -347,163 +320,11 @@ func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Erro
return rawfile.NonBlockingWrite(e.fd, packet)
}
-func (e *endpoint) capViews(k, n int, buffers []int) int {
- c := 0
- for i, s := range buffers {
- c += s
- if c >= n {
- e.views[k][i].CapLength(s - (c - n))
- return i + 1
- }
- }
- return len(buffers)
-}
-
-func (e *endpoint) allocateViews(bufConfig []int) {
- for k := 0; k < len(e.views); k++ {
- var vnetHdr [virtioNetHdrSize]byte
- vnetHdrOff := 0
- if e.Capabilities()&stack.CapabilityGSO != 0 {
- // The kernel adds virtioNetHdr before each packet, but
- // we don't use it, so so we allocate a buffer for it,
- // add it in iovecs but don't add it in a view.
- e.iovecs[k][0] = syscall.Iovec{
- Base: &vnetHdr[0],
- Len: uint64(virtioNetHdrSize),
- }
- vnetHdrOff++
- }
- for i := 0; i < len(bufConfig); i++ {
- if e.views[k][i] != nil {
- break
- }
- b := buffer.NewView(bufConfig[i])
- e.views[k][i] = b
- e.iovecs[k][i+vnetHdrOff] = syscall.Iovec{
- Base: &b[0],
- Len: uint64(len(b)),
- }
- }
- }
-}
-
-// dispatch reads one packet from the file descriptor and dispatches it.
-func (e *endpoint) dispatch() (bool, *tcpip.Error) {
- e.allocateViews(BufConfig)
-
- n, err := rawfile.BlockingReadv(e.fd, e.iovecs[0])
- if err != nil {
- return false, err
- }
- if e.Capabilities()&stack.CapabilityGSO != 0 {
- // Skip virtioNetHdr which is added before each packet, it
- // isn't used and it isn't in a view.
- n -= virtioNetHdrSize
- }
- if n <= e.hdrSize {
- return false, nil
- }
-
- var (
- p tcpip.NetworkProtocolNumber
- remote, local tcpip.LinkAddress
- )
- if e.hdrSize > 0 {
- eth := header.Ethernet(e.views[0][0])
- p = eth.Type()
- remote = eth.SourceAddress()
- local = eth.DestinationAddress()
- } else {
- // We don't get any indication of what the packet is, so try to guess
- // if it's an IPv4 or IPv6 packet.
- switch header.IPVersion(e.views[0][0]) {
- case header.IPv4Version:
- p = header.IPv4ProtocolNumber
- case header.IPv6Version:
- p = header.IPv6ProtocolNumber
- default:
- return true, nil
- }
- }
-
- used := e.capViews(0, n, BufConfig)
- vv := buffer.NewVectorisedView(n, e.views[0][:used])
- vv.TrimFront(e.hdrSize)
-
- e.dispatcher.DeliverNetworkPacket(e, remote, local, p, vv)
-
- // Prepare e.views for another packet: release used views.
- for i := 0; i < used; i++ {
- e.views[0][i] = nil
- }
-
- return true, nil
-}
-
-// recvMMsgDispatch reads more than one packet at a time from the file
-// descriptor and dispatches it.
-func (e *endpoint) recvMMsgDispatch() (bool, *tcpip.Error) {
- e.allocateViews(BufConfig)
-
- nMsgs, err := rawfile.BlockingRecvMMsg(e.fd, e.msgHdrs)
- if err != nil {
- return false, err
- }
- // Process each of received packets.
- for k := 0; k < nMsgs; k++ {
- n := int(e.msgHdrs[k].Len)
- if e.Capabilities()&stack.CapabilityGSO != 0 {
- n -= virtioNetHdrSize
- }
- if n <= e.hdrSize {
- return false, nil
- }
-
- var (
- p tcpip.NetworkProtocolNumber
- remote, local tcpip.LinkAddress
- )
- if e.hdrSize > 0 {
- eth := header.Ethernet(e.views[k][0])
- p = eth.Type()
- remote = eth.SourceAddress()
- local = eth.DestinationAddress()
- } else {
- // We don't get any indication of what the packet is, so try to guess
- // if it's an IPv4 or IPv6 packet.
- switch header.IPVersion(e.views[k][0]) {
- case header.IPv4Version:
- p = header.IPv4ProtocolNumber
- case header.IPv6Version:
- p = header.IPv6ProtocolNumber
- default:
- return true, nil
- }
- }
-
- used := e.capViews(k, int(n), BufConfig)
- vv := buffer.NewVectorisedView(int(n), e.views[k][:used])
- vv.TrimFront(e.hdrSize)
- e.dispatcher.DeliverNetworkPacket(e, remote, local, p, vv)
-
- // Prepare e.views for another packet: release used views.
- for i := 0; i < used; i++ {
- e.views[k][i] = nil
- }
- }
-
- for k := 0; k < nMsgs; k++ {
- e.msgHdrs[k].Len = 0
- }
-
- return true, nil
-}
-
// dispatchLoop reads packets from the file descriptor in a loop and dispatches
// them to the network stack.
func (e *endpoint) dispatchLoop() *tcpip.Error {
for {
- cont, err := e.inboundDispatcher()
+ cont, err := e.inboundDispatcher.dispatch()
if err != nil || !cont {
if e.closed != nil {
e.closed(err)
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index 31138e4ac..fd1722074 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -357,28 +357,6 @@ func TestBufConfigFirst(t *testing.T) {
}
}
-func build(bufConfig []int) *endpoint {
- e := &endpoint{
- views: make([][]buffer.View, MaxMsgsPerRecv),
- iovecs: make([][]syscall.Iovec, MaxMsgsPerRecv),
- msgHdrs: make([]rawfile.MMsgHdr, MaxMsgsPerRecv),
- }
-
- for i, _ := range e.views {
- e.views[i] = make([]buffer.View, len(bufConfig))
- }
- for i := range e.iovecs {
- e.iovecs[i] = make([]syscall.Iovec, len(bufConfig))
- }
- for k, msgHdr := range e.msgHdrs {
- msgHdr.Msg.Iov = &e.iovecs[k][0]
- msgHdr.Msg.Iovlen = uint64(len(bufConfig))
- }
-
- e.allocateViews(bufConfig)
- return e
-}
-
var capLengthTestCases = []struct {
comment string
config []int
@@ -416,19 +394,61 @@ var capLengthTestCases = []struct {
},
}
-func TestCapLength(t *testing.T) {
+func TestReadVDispatcherCapLength(t *testing.T) {
+ for _, c := range capLengthTestCases {
+ // fd does not matter for this test.
+ d := readVDispatcher{fd: -1, e: &endpoint{}}
+ d.views = make([]buffer.View, len(c.config))
+ d.iovecs = make([]syscall.Iovec, len(c.config))
+ d.allocateViews(c.config)
+
+ used := d.capViews(c.n, c.config)
+ if used != c.wantUsed {
+ t.Errorf("Test %q failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed)
+ }
+ lengths := make([]int, len(d.views))
+ for i, v := range d.views {
+ lengths[i] = len(v)
+ }
+ if !reflect.DeepEqual(lengths, c.wantLengths) {
+ t.Errorf("Test %q failed when calling capViews(%d, %v). Got %v. Want %v", c.comment, c.n, c.config, lengths, c.wantLengths)
+ }
+ }
+}
+
+func TestRecvMMsgDispatcherCapLength(t *testing.T) {
for _, c := range capLengthTestCases {
- e := build(c.config)
- used := e.capViews(0, c.n, c.config)
+ d := recvMMsgDispatcher{
+ fd: -1, // fd does not matter for this test.
+ e: &endpoint{},
+ views: make([][]buffer.View, 1),
+ iovecs: make([][]syscall.Iovec, 1),
+ msgHdrs: make([]rawfile.MMsgHdr, 1),
+ }
+
+ for i, _ := range d.views {
+ d.views[i] = make([]buffer.View, len(c.config))
+ }
+ for i := range d.iovecs {
+ d.iovecs[i] = make([]syscall.Iovec, len(c.config))
+ }
+ for k, msgHdr := range d.msgHdrs {
+ msgHdr.Msg.Iov = &d.iovecs[k][0]
+ msgHdr.Msg.Iovlen = uint64(len(c.config))
+ }
+
+ d.allocateViews(c.config)
+
+ used := d.capViews(0, c.n, c.config)
if used != c.wantUsed {
- t.Errorf("Test \"%s\" failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed)
+ t.Errorf("Test %q failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed)
}
- lengths := make([]int, len(e.views[0]))
- for i, v := range e.views[0] {
+ lengths := make([]int, len(d.views[0]))
+ for i, v := range d.views[0] {
lengths[i] = len(v)
}
if !reflect.DeepEqual(lengths, c.wantLengths) {
- t.Errorf("Test \"%s\" failed when calling capViews(%d, %v). Got %v. Want %v", c.comment, c.n, c.config, lengths, c.wantLengths)
+ t.Errorf("Test %q failed when calling capViews(%d, %v). Got %v. Want %v", c.comment, c.n, c.config, lengths, c.wantLengths)
}
}
diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go
index 430c85a42..6b7f2a185 100644
--- a/pkg/tcpip/link/fdbased/mmap.go
+++ b/pkg/tcpip/link/fdbased/mmap.go
@@ -18,16 +18,8 @@ package fdbased
import "gvisor.googlesource.com/gvisor/pkg/tcpip"
-// Stubbed out versions for non-linux/non-amd64 platforms.
+// Stubbed out version for non-linux/non-amd64 platforms.
-func (e *endpoint) setupPacketRXRing() error {
- return nil
-}
-
-func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) {
+func newPacketMMapDispatcher(fd int, e *endpoint) (linkDispatcher, *tcpip.Error) {
return nil, nil
}
-
-func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) {
- return false, nil
-}
diff --git a/pkg/tcpip/link/fdbased/mmap_amd64.go b/pkg/tcpip/link/fdbased/mmap_amd64.go
new file mode 100644
index 000000000..1c2d8c468
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/mmap_amd64.go
@@ -0,0 +1,194 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux,amd64
+
+package fdbased
+
+import (
+ "encoding/binary"
+ "syscall"
+
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/header"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile"
+)
+
+const (
+ tPacketAlignment = uintptr(16)
+ tpStatusKernel = 0
+ tpStatusUser = 1
+ tpStatusCopy = 2
+ tpStatusLosing = 4
+)
+
+// We overallocate the frame size to accommodate space for the
+// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding.
+//
+// Memory allocated for the ring buffer: tpBlockSize * tpBlockNR = 2 MiB
+//
+// NOTE:
+// Frames need to be aligned at 16 byte boundaries.
+// BlockSize needs to be page aligned.
+//
+// For details see PACKET_MMAP setting constraints in
+// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
+const (
+ tpFrameSize = 65536 + 128
+ tpBlockSize = tpFrameSize * 32
+ tpBlockNR = 1
+ tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize
+)
+
+// tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct
+// translation of the TPACKET_ALIGN macro in <linux/if_packet.h>.
+func tPacketAlign(v uintptr) uintptr {
+ return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1))
+}
+
+// tPacketReq is the tpacket_req structure as described in
+// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
+type tPacketReq struct {
+ tpBlockSize uint32
+ tpBlockNR uint32
+ tpFrameSize uint32
+ tpFrameNR uint32
+}
+
+// tPacketHdr is tpacket_hdr structure as described in <linux/if_packet.h>
+type tPacketHdr []byte
+
+const (
+ tpStatusOffset = 0
+ tpLenOffset = 8
+ tpSnapLenOffset = 12
+ tpMacOffset = 16
+ tpNetOffset = 18
+ tpSecOffset = 20
+ tpUSecOffset = 24
+)
+
+func (t tPacketHdr) tpLen() uint32 {
+ return binary.LittleEndian.Uint32(t[tpLenOffset:])
+}
+
+func (t tPacketHdr) tpSnapLen() uint32 {
+ return binary.LittleEndian.Uint32(t[tpSnapLenOffset:])
+}
+
+func (t tPacketHdr) tpMac() uint16 {
+ return binary.LittleEndian.Uint16(t[tpMacOffset:])
+}
+
+func (t tPacketHdr) tpNet() uint16 {
+ return binary.LittleEndian.Uint16(t[tpNetOffset:])
+}
+
+func (t tPacketHdr) tpSec() uint32 {
+ return binary.LittleEndian.Uint32(t[tpSecOffset:])
+}
+
+func (t tPacketHdr) tpUSec() uint32 {
+ return binary.LittleEndian.Uint32(t[tpUSecOffset:])
+}
+
+func (t tPacketHdr) Payload() []byte {
+ return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()]
+}
+
+// packetMMapDispatcher uses PACKET_RX_RING's to read/dispatch inbound packets.
+// See: mmap_amd64_unsafe.go for implementation details.
+type packetMMapDispatcher struct {
+ // fd is the file descriptor used to send and receive packets.
+ fd int
+
+ // e is the endpoint this dispatcher is attached to.
+ e *endpoint
+
+ // ringBuffer is only used when PacketMMap dispatcher is used and points
+ // to the start of the mmapped PACKET_RX_RING buffer.
+ ringBuffer []byte
+
+ // ringOffset is the current offset into the ring buffer where the next
+ // inbound packet will be placed by the kernel.
+ ringOffset int
+}
+
+func (d *packetMMapDispatcher) readMMappedPacket() ([]byte, *tcpip.Error) {
+ hdr := tPacketHdr(d.ringBuffer[d.ringOffset*tpFrameSize:])
+ for hdr.tpStatus()&tpStatusUser == 0 {
+ event := rawfile.PollEvent{
+ FD: int32(d.fd),
+ Events: unix.POLLIN | unix.POLLERR,
+ }
+ if _, errno := rawfile.BlockingPoll(&event, 1, -1); errno != 0 {
+ if errno == syscall.EINTR {
+ continue
+ }
+ return nil, rawfile.TranslateErrno(errno)
+ }
+ if hdr.tpStatus()&tpStatusCopy != 0 {
+ // This frame is truncated so skip it after flipping the
+ // buffer to the kernel.
+ hdr.setTPStatus(tpStatusKernel)
+ d.ringOffset = (d.ringOffset + 1) % tpFrameNR
+ hdr = (tPacketHdr)(d.ringBuffer[d.ringOffset*tpFrameSize:])
+ continue
+ }
+ }
+
+ // Copy out the packet from the mmapped frame to a locally owned buffer.
+ pkt := make([]byte, hdr.tpSnapLen())
+ copy(pkt, hdr.Payload())
+ // Release packet to kernel.
+ hdr.setTPStatus(tpStatusKernel)
+ d.ringOffset = (d.ringOffset + 1) % tpFrameNR
+ return pkt, nil
+}
+
+// dispatch reads packets from an mmaped ring buffer and dispatches them to the
+// network stack.
+func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
+ pkt, err := d.readMMappedPacket()
+ if err != nil {
+ return false, err
+ }
+ var (
+ p tcpip.NetworkProtocolNumber
+ remote, local tcpip.LinkAddress
+ )
+ if d.e.hdrSize > 0 {
+ eth := header.Ethernet(pkt)
+ p = eth.Type()
+ remote = eth.SourceAddress()
+ local = eth.DestinationAddress()
+ } else {
+ // We don't get any indication of what the packet is, so try to guess
+ // if it's an IPv4 or IPv6 packet.
+ switch header.IPVersion(pkt) {
+ case header.IPv4Version:
+ p = header.IPv4ProtocolNumber
+ case header.IPv6Version:
+ p = header.IPv6ProtocolNumber
+ default:
+ return true, nil
+ }
+ }
+
+ pkt = pkt[d.e.hdrSize:]
+ d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}))
+ return true, nil
+}
diff --git a/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go
index 135da2498..47cb1d1cc 100644
--- a/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go
+++ b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go
@@ -17,76 +17,17 @@
package fdbased
import (
- "encoding/binary"
"fmt"
"sync/atomic"
"syscall"
"unsafe"
"golang.org/x/sys/unix"
- "gvisor.googlesource.com/gvisor/pkg/tcpip"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/header"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile"
)
-const (
- tPacketAlignment = uintptr(16)
- tpStatusKernel = 0
- tpStatusUser = 1
- tpStatusCopy = 2
- tpStatusLosing = 4
-)
-
-// We overallocate the frame size to accommodate space for the
-// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding.
-//
-// Memory allocated for the ring buffer: tpBlockSize * tpBlockNR = 2 MiB
-//
-// NOTE:
-// Frames need to be aligned at 16 byte boundaries.
-// BlockSize needs to be page aligned.
-//
-// For details see PACKET_MMAP setting constraints in
-// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
-const (
- tpFrameSize = 65536 + 128
- tpBlockSize = tpFrameSize * 32
- tpBlockNR = 1
- tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize
-)
-
-// tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct
-// translation of the TPACKET_ALIGN macro in <linux/if_packet.h>.
-func tPacketAlign(v uintptr) uintptr {
- return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1))
-}
-
// tPacketHdrlen is the TPACKET_HDRLEN variable defined in <linux/if_packet.h>.
var tPacketHdrlen = tPacketAlign(unsafe.Sizeof(tPacketHdr{}) + unsafe.Sizeof(syscall.RawSockaddrLinklayer{}))
-// tPacketReq is the tpacket_req structure as described in
-// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
-type tPacketReq struct {
- tpBlockSize uint32
- tpBlockNR uint32
- tpFrameSize uint32
- tpFrameNR uint32
-}
-
-// tPacketHdr is tpacket_hdr structure as described in <linux/if_packet.h>
-type tPacketHdr []byte
-
-const (
- tpStatusOffset = 0
- tpLenOffset = 8
- tpSnapLenOffset = 12
- tpMacOffset = 16
- tpNetOffset = 18
- tpSecOffset = 20
- tpUSecOffset = 24
-)
-
// tpStatus returns the frame status field.
// The status is concurrently updated by the kernel as a result we must
// use atomic operations to prevent races.
@@ -105,38 +46,14 @@ func (t tPacketHdr) setTPStatus(status uint32) {
atomic.StoreUint32((*uint32)(statusPtr), status)
}
-func (t tPacketHdr) tpLen() uint32 {
- return binary.LittleEndian.Uint32(t[tpLenOffset:])
-}
-
-func (t tPacketHdr) tpSnapLen() uint32 {
- return binary.LittleEndian.Uint32(t[tpSnapLenOffset:])
-}
-
-func (t tPacketHdr) tpMac() uint16 {
- return binary.LittleEndian.Uint16(t[tpMacOffset:])
-}
-
-func (t tPacketHdr) tpNet() uint16 {
- return binary.LittleEndian.Uint16(t[tpNetOffset:])
-}
-
-func (t tPacketHdr) tpSec() uint32 {
- return binary.LittleEndian.Uint32(t[tpSecOffset:])
-}
-
-func (t tPacketHdr) tpUSec() uint32 {
- return binary.LittleEndian.Uint32(t[tpUSecOffset:])
-}
-
-func (t tPacketHdr) Payload() []byte {
- return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()]
-}
-
-func (e *endpoint) setupPacketRXRing() error {
+func newPacketMMapDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
+ d := &packetMMapDispatcher{
+ fd: fd,
+ e: e,
+ }
pageSize := unix.Getpagesize()
if tpBlockSize%pageSize != 0 {
- return fmt.Errorf("tpBlockSize: %d is not page aligned, pagesize: %d", tpBlockSize, pageSize)
+ return nil, fmt.Errorf("tpBlockSize: %d is not page aligned, pagesize: %d", tpBlockSize, pageSize)
}
tReq := tPacketReq{
tpBlockSize: uint32(tpBlockSize),
@@ -145,84 +62,17 @@ func (e *endpoint) setupPacketRXRing() error {
tpFrameNR: uint32(tpFrameNR),
}
// Setup PACKET_RX_RING.
- if err := setsockopt(e.fd, syscall.SOL_PACKET, syscall.PACKET_RX_RING, unsafe.Pointer(&tReq), unsafe.Sizeof(tReq)); err != nil {
- return fmt.Errorf("failed to enable PACKET_RX_RING: %v", err)
+ if err := setsockopt(d.fd, syscall.SOL_PACKET, syscall.PACKET_RX_RING, unsafe.Pointer(&tReq), unsafe.Sizeof(tReq)); err != nil {
+ return nil, fmt.Errorf("failed to enable PACKET_RX_RING: %v", err)
}
// Let's mmap the blocks.
sz := tpBlockSize * tpBlockNR
- buf, err := syscall.Mmap(e.fd, 0, sz, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
- if err != nil {
- return fmt.Errorf("syscall.Mmap(...,0, %v, ...) failed = %v", sz, err)
- }
- e.ringBuffer = buf
- return nil
-}
-
-func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) {
- hdr := (tPacketHdr)(e.ringBuffer[e.ringOffset*tpFrameSize:])
- for hdr.tpStatus()&tpStatusUser == 0 {
- event := rawfile.PollEvent{
- FD: int32(e.fd),
- Events: unix.POLLIN | unix.POLLERR,
- }
- _, errno := rawfile.BlockingPoll(&event, 1, -1)
- if errno != 0 {
- if errno == syscall.EINTR {
- continue
- }
- return nil, rawfile.TranslateErrno(errno)
- }
- if hdr.tpStatus()&tpStatusCopy != 0 {
- // This frame is truncated so skip it after flipping the
- // buffer to the kernel.
- hdr.setTPStatus(tpStatusKernel)
- e.ringOffset = (e.ringOffset + 1) % tpFrameNR
- hdr = (tPacketHdr)(e.ringBuffer[e.ringOffset*tpFrameSize:])
- continue
- }
- }
-
- // Copy out the packet from the mmapped frame to a locally owned buffer.
- pkt := make([]byte, hdr.tpSnapLen())
- copy(pkt, hdr.Payload())
- // Release packet to kernel.
- hdr.setTPStatus(tpStatusKernel)
- e.ringOffset = (e.ringOffset + 1) % tpFrameNR
- return pkt, nil
-}
-
-// packetMMapDispatch reads packets from an mmaped ring buffer and dispatches
-// them to the network stack.
-func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) {
- pkt, err := e.readMMappedPacket()
+ buf, err := syscall.Mmap(d.fd, 0, sz, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
if err != nil {
- return false, err
+ return nil, fmt.Errorf("syscall.Mmap(...,0, %v, ...) failed = %v", sz, err)
}
- var (
- p tcpip.NetworkProtocolNumber
- remote, local tcpip.LinkAddress
- )
- if e.hdrSize > 0 {
- eth := header.Ethernet(pkt)
- p = eth.Type()
- remote = eth.SourceAddress()
- local = eth.DestinationAddress()
- } else {
- // We don't get any indication of what the packet is, so try to guess
- // if it's an IPv4 or IPv6 packet.
- switch header.IPVersion(pkt) {
- case header.IPv4Version:
- p = header.IPv4ProtocolNumber
- case header.IPv6Version:
- p = header.IPv6ProtocolNumber
- default:
- return true, nil
- }
- }
-
- pkt = pkt[e.hdrSize:]
- e.dispatcher.DeliverNetworkPacket(e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}))
- return true, nil
+ d.ringBuffer = buf
+ return d, nil
}
func setsockopt(fd, level, name int, val unsafe.Pointer, vallen uintptr) error {
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go
new file mode 100644
index 000000000..1ae0e3359
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go
@@ -0,0 +1,309 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package fdbased
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/header"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
+)
+
+// BufConfig defines the shape of the vectorised view used to read packets from the NIC.
+var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}
+
+// readVDispatcher uses readv() system call to read inbound packets and
+// dispatches them.
+type readVDispatcher struct {
+ // fd is the file descriptor used to send and receive packets.
+ fd int
+
+ // e is the endpoint this dispatcher is attached to.
+ e *endpoint
+
+ // views are the actual buffers that hold the packet contents.
+ views []buffer.View
+
+ // iovecs are initialized with base pointers/len of the corresponding
+ // entries in the views defined above, except when GSO is enabled then
+ // the first iovec points to a buffer for the vnet header which is
+ // stripped before the views are passed up the stack for further
+ // processing.
+ iovecs []syscall.Iovec
+}
+
+func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
+ d := &readVDispatcher{fd: fd, e: e}
+ d.views = make([]buffer.View, len(BufConfig))
+ iovLen := len(BufConfig)
+ if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+ iovLen++
+ }
+ d.iovecs = make([]syscall.Iovec, iovLen)
+ return d, nil
+}
+
+func (d *readVDispatcher) allocateViews(bufConfig []int) {
+ var vnetHdr [virtioNetHdrSize]byte
+ vnetHdrOff := 0
+ if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+ // The kernel adds virtioNetHdr before each packet, but
+ // we don't use it, so so we allocate a buffer for it,
+ // add it in iovecs but don't add it in a view.
+ d.iovecs[0] = syscall.Iovec{
+ Base: &vnetHdr[0],
+ Len: uint64(virtioNetHdrSize),
+ }
+ vnetHdrOff++
+ }
+ for i := 0; i < len(bufConfig); i++ {
+ if d.views[i] != nil {
+ break
+ }
+ b := buffer.NewView(bufConfig[i])
+ d.views[i] = b
+ d.iovecs[i+vnetHdrOff] = syscall.Iovec{
+ Base: &b[0],
+ Len: uint64(len(b)),
+ }
+ }
+}
+
+func (d *readVDispatcher) capViews(n int, buffers []int) int {
+ c := 0
+ for i, s := range buffers {
+ c += s
+ if c >= n {
+ d.views[i].CapLength(s - (c - n))
+ return i + 1
+ }
+ }
+ return len(buffers)
+}
+
+// dispatch reads one packet from the file descriptor and dispatches it.
+func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
+ d.allocateViews(BufConfig)
+
+ n, err := rawfile.BlockingReadv(d.fd, d.iovecs)
+ if err != nil {
+ return false, err
+ }
+ if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+ // Skip virtioNetHdr which is added before each packet, it
+ // isn't used and it isn't in a view.
+ n -= virtioNetHdrSize
+ }
+ if n <= d.e.hdrSize {
+ return false, nil
+ }
+
+ var (
+ p tcpip.NetworkProtocolNumber
+ remote, local tcpip.LinkAddress
+ )
+ if d.e.hdrSize > 0 {
+ eth := header.Ethernet(d.views[0])
+ p = eth.Type()
+ remote = eth.SourceAddress()
+ local = eth.DestinationAddress()
+ } else {
+ // We don't get any indication of what the packet is, so try to guess
+ // if it's an IPv4 or IPv6 packet.
+ switch header.IPVersion(d.views[0]) {
+ case header.IPv4Version:
+ p = header.IPv4ProtocolNumber
+ case header.IPv6Version:
+ p = header.IPv6ProtocolNumber
+ default:
+ return true, nil
+ }
+ }
+
+ used := d.capViews(n, BufConfig)
+ vv := buffer.NewVectorisedView(n, d.views[:used])
+ vv.TrimFront(d.e.hdrSize)
+
+ d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv)
+
+ // Prepare e.views for another packet: release used views.
+ for i := 0; i < used; i++ {
+ d.views[i] = nil
+ }
+
+ return true, nil
+}
+
+// recvMMsgDispatcher uses the recvmmsg system call to read inbound packets and
+// dispatches them.
+type recvMMsgDispatcher struct {
+ // fd is the file descriptor used to send and receive packets.
+ fd int
+
+ // e is the endpoint this dispatcher is attached to.
+ e *endpoint
+
+ // views is an array of array of buffers that contain packet contents.
+ views [][]buffer.View
+
+ // iovecs is an array of array of iovec records where each iovec base
+ // pointer and length are initialzed to the corresponding view above,
+ // except when GSO is neabled then the first iovec in each array of
+ // iovecs points to a buffer for the vnet header which is stripped
+ // before the views are passed up the stack for further processing.
+ iovecs [][]syscall.Iovec
+
+ // msgHdrs is an array of MMsgHdr objects where each MMsghdr is used to
+ // reference an array of iovecs in the iovecs field defined above. This
+ // array is passed as the parameter to recvmmsg call to retrieve
+ // potentially more than 1 packet per syscall.
+ msgHdrs []rawfile.MMsgHdr
+}
+
+const (
+ // MaxMsgsPerRecv is the maximum number of packets we want to retrieve
+ // in a single RecvMMsg call.
+ MaxMsgsPerRecv = 8
+)
+
+func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
+ d := &recvMMsgDispatcher{
+ fd: fd,
+ e: e,
+ }
+ d.views = make([][]buffer.View, MaxMsgsPerRecv)
+ for i := range d.views {
+ d.views[i] = make([]buffer.View, len(BufConfig))
+ }
+ d.iovecs = make([][]syscall.Iovec, MaxMsgsPerRecv)
+ iovLen := len(BufConfig)
+ if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+ // virtioNetHdr is prepended before each packet.
+ iovLen++
+ }
+ for i := range d.iovecs {
+ d.iovecs[i] = make([]syscall.Iovec, iovLen)
+ }
+ d.msgHdrs = make([]rawfile.MMsgHdr, MaxMsgsPerRecv)
+ for i := range d.msgHdrs {
+ d.msgHdrs[i].Msg.Iov = &d.iovecs[i][0]
+ d.msgHdrs[i].Msg.Iovlen = uint64(iovLen)
+ }
+ return d, nil
+}
+
+func (d *recvMMsgDispatcher) capViews(k, n int, buffers []int) int {
+ c := 0
+ for i, s := range buffers {
+ c += s
+ if c >= n {
+ d.views[k][i].CapLength(s - (c - n))
+ return i + 1
+ }
+ }
+ return len(buffers)
+}
+
+func (d *recvMMsgDispatcher) allocateViews(bufConfig []int) {
+ for k := 0; k < len(d.views); k++ {
+ var vnetHdr [virtioNetHdrSize]byte
+ vnetHdrOff := 0
+ if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+ // The kernel adds virtioNetHdr before each packet, but
+ // we don't use it, so so we allocate a buffer for it,
+ // add it in iovecs but don't add it in a view.
+ d.iovecs[k][0] = syscall.Iovec{
+ Base: &vnetHdr[0],
+ Len: uint64(virtioNetHdrSize),
+ }
+ vnetHdrOff++
+ }
+ for i := 0; i < len(bufConfig); i++ {
+ if d.views[k][i] != nil {
+ break
+ }
+ b := buffer.NewView(bufConfig[i])
+ d.views[k][i] = b
+ d.iovecs[k][i+vnetHdrOff] = syscall.Iovec{
+ Base: &b[0],
+ Len: uint64(len(b)),
+ }
+ }
+ }
+}
+
+// recvMMsgDispatch reads more than one packet at a time from the file
+// descriptor and dispatches it.
+func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
+ d.allocateViews(BufConfig)
+
+ nMsgs, err := rawfile.BlockingRecvMMsg(d.fd, d.msgHdrs)
+ if err != nil {
+ return false, err
+ }
+ // Process each of received packets.
+ for k := 0; k < nMsgs; k++ {
+ n := int(d.msgHdrs[k].Len)
+ if d.e.Capabilities()&stack.CapabilityGSO != 0 {
+ n -= virtioNetHdrSize
+ }
+ if n <= d.e.hdrSize {
+ return false, nil
+ }
+
+ var (
+ p tcpip.NetworkProtocolNumber
+ remote, local tcpip.LinkAddress
+ )
+ if d.e.hdrSize > 0 {
+ eth := header.Ethernet(d.views[k][0])
+ p = eth.Type()
+ remote = eth.SourceAddress()
+ local = eth.DestinationAddress()
+ } else {
+ // We don't get any indication of what the packet is, so try to guess
+ // if it's an IPv4 or IPv6 packet.
+ switch header.IPVersion(d.views[k][0]) {
+ case header.IPv4Version:
+ p = header.IPv4ProtocolNumber
+ case header.IPv6Version:
+ p = header.IPv6ProtocolNumber
+ default:
+ return true, nil
+ }
+ }
+
+ used := d.capViews(k, int(n), BufConfig)
+ vv := buffer.NewVectorisedView(int(n), d.views[k][:used])
+ vv.TrimFront(d.e.hdrSize)
+ d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv)
+
+ // Prepare e.views for another packet: release used views.
+ for i := 0; i < used; i++ {
+ d.views[k][i] = nil
+ }
+ }
+
+ for k := 0; k < nMsgs; k++ {
+ d.msgHdrs[k].Len = 0
+ }
+
+ return true, nil
+}