summaryrefslogtreecommitdiffhomepage
path: root/pkg
diff options
context:
space:
mode:
Diffstat (limited to 'pkg')
-rw-r--r--pkg/tcpip/link/fdbased/BUILD7
-rw-r--r--pkg/tcpip/link/fdbased/endpoint.go100
-rw-r--r--pkg/tcpip/link/fdbased/mmap.go33
-rw-r--r--pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go210
-rw-r--r--pkg/tcpip/link/rawfile/blockingpoll_amd64.s6
-rw-r--r--pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go2
-rw-r--r--pkg/tcpip/link/rawfile/blockingpoll_unsafe.go4
-rw-r--r--pkg/tcpip/link/rawfile/rawfile_unsafe.go33
8 files changed, 343 insertions, 52 deletions
diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD
index 0d78c9b15..bcf9c023e 100644
--- a/pkg/tcpip/link/fdbased/BUILD
+++ b/pkg/tcpip/link/fdbased/BUILD
@@ -4,7 +4,11 @@ package(licenses = ["notice"])
go_library(
name = "fdbased",
- srcs = ["endpoint.go"],
+ srcs = [
+ "endpoint.go",
+ "mmap.go",
+ "mmap_amd64_unsafe.go",
+ ],
importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased",
visibility = [
"//visibility:public",
@@ -15,6 +19,7 @@ go_library(
"//pkg/tcpip/header",
"//pkg/tcpip/link/rawfile",
"//pkg/tcpip/stack",
+ "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index 87c8ab1fc..20f379ab0 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -47,6 +47,30 @@ var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}
// NetworkDispatcher.
type linkDispatcher func() (bool, *tcpip.Error)
+// PacketDispatchMode are the various supported methods of receiving and
+// dispatching packets from the underlying FD.
+type PacketDispatchMode int
+
+const (
+ // Readv is the default dispatch mode and is the least performant of the
+ // dispatch options but the one that is supported by all underlying FD
+ // types.
+ Readv PacketDispatchMode = iota
+ // RecvMMsg enables use of recvmmsg() syscall instead of readv() to
+ // read inbound packets. This reduces # of syscalls needed to process
+ // packets.
+ //
+ // NOTE: recvmmsg() is only supported for sockets, so if the underlying
+ // FD is not a socket then the code will still fall back to the readv()
+ // path.
+ RecvMMsg
+ // PacketMMap enables use of PACKET_RX_RING to receive packets from the
+ // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The
+ // primary use-case for this is runsc which uses an AF_PACKET FD to
+ // receive packets from the veth device.
+ PacketMMap
+)
+
type endpoint struct {
// fd is the file descriptor used to send and receive packets.
fd int
@@ -68,9 +92,11 @@ type endpoint struct {
// its end of the communication pipe.
closed func(*tcpip.Error)
- views [][]buffer.View
- iovecs [][]syscall.Iovec
- msgHdrs []rawfile.MMsgHdr
+ views [][]buffer.View
+ iovecs [][]syscall.Iovec
+ // msgHdrs is only used by the RecvMMsg dispatcher.
+ msgHdrs []rawfile.MMsgHdr
+
inboundDispatcher linkDispatcher
dispatcher stack.NetworkDispatcher
@@ -79,28 +105,31 @@ type endpoint struct {
// endpoint (false).
handleLocal bool
- // useRecvMMsg enables use of recvmmsg() syscall instead of readv() to
- // read inbound packets. This reduces # of syscalls needed to process
- // packets.
- //
- // NOTE: recvmmsg() is only supported for sockets, so if the underlying
- // FD is not a socket then the code will still fall back to the readv()
- // path.
- useRecvMMsg bool
+ // packetDispatchMode controls the packet dispatcher used by this
+ // endpoint.
+ packetDispatchMode PacketDispatchMode
+
+ // ringBuffer is only used when PacketMMap dispatcher is used and points
+ // to the start of the mmapped PACKET_RX_RING buffer.
+ ringBuffer []byte
+
+ // ringOffset is the current offset into the ring buffer where the next
+ // inbound packet will be placed by the kernel.
+ ringOffset int
}
// Options specify the details about the fd-based endpoint to be created.
type Options struct {
- FD int
- MTU uint32
- EthernetHeader bool
- ChecksumOffload bool
- ClosedFunc func(*tcpip.Error)
- Address tcpip.LinkAddress
- SaveRestore bool
- DisconnectOk bool
- HandleLocal bool
- UseRecvMMsg bool
+ FD int
+ MTU uint32
+ EthernetHeader bool
+ ChecksumOffload bool
+ ClosedFunc func(*tcpip.Error)
+ Address tcpip.LinkAddress
+ SaveRestore bool
+ DisconnectOk bool
+ HandleLocal bool
+ PacketDispatchMode PacketDispatchMode
}
// New creates a new fd-based endpoint.
@@ -133,21 +162,31 @@ func New(opts *Options) tcpip.LinkEndpointID {
}
e := &endpoint{
- fd: opts.FD,
- mtu: opts.MTU,
- caps: caps,
- closed: opts.ClosedFunc,
- addr: opts.Address,
- hdrSize: hdrSize,
- handleLocal: opts.HandleLocal,
- useRecvMMsg: opts.UseRecvMMsg,
+ fd: opts.FD,
+ mtu: opts.MTU,
+ caps: caps,
+ closed: opts.ClosedFunc,
+ addr: opts.Address,
+ hdrSize: hdrSize,
+ handleLocal: opts.HandleLocal,
+ packetDispatchMode: opts.PacketDispatchMode,
+ }
+
+ if isSocketFD(opts.FD) && e.packetDispatchMode == PacketMMap {
+ if err := e.setupPacketRXRing(); err != nil {
+ // TODO: replace panic with an error return.
+ panic(fmt.Sprintf("e.setupPacketRXRing failed: %v", err))
+ }
+ e.inboundDispatcher = e.packetMMapDispatch
+ return stack.RegisterLinkEndpoint(e)
}
+
// For non-socket FDs we read one packet a time (e.g. TAP devices)
msgsPerRecv := 1
e.inboundDispatcher = e.dispatch
// If the provided FD is a socket then we optimize packet reads by
// using recvmmsg() instead of read() to read packets in a batch.
- if isSocketFD(opts.FD) && e.useRecvMMsg {
+ if isSocketFD(opts.FD) && e.packetDispatchMode == RecvMMsg {
e.inboundDispatcher = e.recvMMsgDispatch
msgsPerRecv = MaxMsgsPerRecv
}
@@ -165,6 +204,7 @@ func New(opts *Options) tcpip.LinkEndpointID {
e.msgHdrs[i].Msg.Iov = &e.iovecs[i][0]
e.msgHdrs[i].Msg.Iovlen = uint64(len(BufConfig))
}
+
return stack.RegisterLinkEndpoint(e)
}
diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go
new file mode 100644
index 000000000..f1e71c233
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/mmap.go
@@ -0,0 +1,33 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !linux !amd64
+
+package fdbased
+
+import "gvisor.googlesource.com/gvisor/pkg/tcpip"
+
+// Stubbed out versions for non-linux/non-amd64 platforms.
+
+func (e *endpoint) setupPacketRXRing() error {
+ return nil
+}
+
+func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) {
+ return nil, nil
+}
+
+func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) {
+ return false, nil
+}
diff --git a/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go
new file mode 100644
index 000000000..d88c3f8a5
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go
@@ -0,0 +1,210 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux,amd64
+
+package fdbased
+
+import (
+ "encoding/binary"
+ "fmt"
+ "syscall"
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/header"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile"
+)
+
+const (
+ tPacketAlignment = uintptr(16)
+ tpStatusKernel = 0
+ tpStatusUser = 1
+ tpStatusCopy = 2
+ tpStatusLosing = 4
+)
+
+// We overallocate the frame size to accommodate space for the
+// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding.
+//
+// NOTE: Frames need to be aligned at 16 byte boundaries.
+const (
+ tpFrameSize = 65536 + 128
+ tpBlockSize = tpFrameSize * 128
+ tpBlockNR = 10
+ tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize
+)
+
+// tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct
+// translation of the TPACKET_ALIGN macro in <linux/if_packet.h>.
+func tPacketAlign(v uintptr) uintptr {
+ return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1))
+}
+
+// tPacketHdrlen is the TPACKET_HDRLEN variable defined in <linux/if_packet.h>.
+var tPacketHdrlen = tPacketAlign(unsafe.Sizeof(tPacketHdr{}) + unsafe.Sizeof(syscall.RawSockaddrLinklayer{}))
+
+// tPacketReq is the tpacket_req structure as described in
+// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
+type tPacketReq struct {
+ tpBlockSize uint32
+ tpBlockNR uint32
+ tpFrameSize uint32
+ tpFrameNR uint32
+}
+
+// tPacketHdr is tpacket_hdr structure as described in <linux/if_packet.h>
+type tPacketHdr []byte
+
+const (
+ tpStatusOffset = 0
+ tpLenOffset = 8
+ tpSnapLenOffset = 12
+ tpMacOffset = 16
+ tpNetOffset = 18
+ tpSecOffset = 20
+ tpUSecOffset = 24
+)
+
+func (t tPacketHdr) tpStatus() uint32 {
+ return binary.LittleEndian.Uint32(t[tpStatusOffset:])
+}
+
+func (t tPacketHdr) setTPStatus(status uint32) {
+ binary.LittleEndian.PutUint32(t[tpStatusOffset:], status)
+}
+
+func (t tPacketHdr) tpLen() uint32 {
+ return binary.LittleEndian.Uint32(t[tpLenOffset:])
+}
+
+func (t tPacketHdr) tpSnapLen() uint32 {
+ return binary.LittleEndian.Uint32(t[tpSnapLenOffset:])
+}
+
+func (t tPacketHdr) tpMac() uint16 {
+ return binary.LittleEndian.Uint16(t[tpMacOffset:])
+}
+
+func (t tPacketHdr) tpNet() uint16 {
+ return binary.LittleEndian.Uint16(t[tpNetOffset:])
+}
+
+func (t tPacketHdr) tpSec() uint32 {
+ return binary.LittleEndian.Uint32(t[tpSecOffset:])
+}
+
+func (t tPacketHdr) tpUSec() uint32 {
+ return binary.LittleEndian.Uint32(t[tpUSecOffset:])
+}
+
+func (t tPacketHdr) Payload() []byte {
+ return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()]
+}
+
+func (e *endpoint) setupPacketRXRing() error {
+ tReq := tPacketReq{
+ tpBlockSize: uint32(tpBlockSize),
+ tpBlockNR: uint32(tpBlockNR),
+ tpFrameSize: uint32(tpFrameSize),
+ tpFrameNR: uint32(tpFrameNR),
+ }
+ // Setup PACKET_RX_RING.
+ if err := setsockopt(e.fd, syscall.SOL_PACKET, syscall.PACKET_RX_RING, unsafe.Pointer(&tReq), unsafe.Sizeof(tReq)); err != nil {
+ return fmt.Errorf("failed to enable PACKET_RX_RING: %v", err)
+ }
+ // Let's mmap the blocks.
+ sz := tpBlockSize * tpBlockNR
+ buf, err := syscall.Mmap(e.fd, 0, sz, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
+ if err != nil {
+ return fmt.Errorf("syscall.Mmap(...,0, %v, ...) failed = %v", sz, err)
+ }
+ e.ringBuffer = buf
+ return nil
+}
+
+func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) {
+ hdr := (tPacketHdr)(e.ringBuffer[0+e.ringOffset*tpFrameSize:])
+ for (hdr.tpStatus() & tpStatusUser) == 0 {
+ event := rawfile.PollEvent{
+ FD: int32(e.fd),
+ Events: unix.POLLIN | unix.POLLERR,
+ }
+ _, errno := rawfile.BlockingPoll(&event, 1, -1)
+ if errno != 0 {
+ if errno == syscall.EINTR {
+ continue
+ }
+ return nil, rawfile.TranslateErrno(errno)
+ }
+ if hdr.tpStatus()&tpStatusCopy != 0 {
+ continue
+ }
+ if hdr.tpStatus()&tpStatusLosing != 0 {
+ continue
+ }
+ }
+
+ // Copy out the packet from the mmapped frame to a locally owned buffer.
+ pkt := make([]byte, hdr.tpSnapLen())
+ copy(pkt, hdr.Payload())
+ // Release packet to kernel.
+ hdr.setTPStatus(tpStatusKernel)
+ e.ringOffset = (e.ringOffset + 1) % tpFrameNR
+ return pkt, nil
+}
+
+// packetMMapDispatch reads packets from an mmaped ring buffer and dispatches
+// them to the network stack.
+func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) {
+ pkt, err := e.readMMappedPacket()
+ if err != nil {
+ return false, err
+ }
+ var (
+ p tcpip.NetworkProtocolNumber
+ remote, local tcpip.LinkAddress
+ )
+ if e.hdrSize > 0 {
+ eth := header.Ethernet(pkt)
+ p = eth.Type()
+ remote = eth.SourceAddress()
+ local = eth.DestinationAddress()
+ } else {
+ // We don't get any indication of what the packet is, so try to guess
+ // if it's an IPv4 or IPv6 packet.
+ switch header.IPVersion(pkt) {
+ case header.IPv4Version:
+ p = header.IPv4ProtocolNumber
+ case header.IPv6Version:
+ p = header.IPv6ProtocolNumber
+ default:
+ return true, nil
+ }
+ }
+
+ pkt = pkt[e.hdrSize:]
+ e.dispatcher.DeliverNetworkPacket(e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)}))
+ return true, nil
+}
+
+func setsockopt(fd, level, name int, val unsafe.Pointer, vallen uintptr) error {
+ if _, _, errno := syscall.Syscall6(syscall.SYS_SETSOCKOPT, uintptr(fd), uintptr(level), uintptr(name), uintptr(val), vallen, 0); errno != 0 {
+ return error(errno)
+ }
+
+ return nil
+}
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s
index 8e22ba661..9dade5421 100644
--- a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s
+++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s
@@ -14,12 +14,12 @@
#include "textflag.h"
-// blockingPoll makes the poll() syscall while calling the version of
+// BlockingPoll makes the poll() syscall while calling the version of
// entersyscall that relinquishes the P so that other Gs can run. This is meant
// to be called in cases when the syscall is expected to block.
//
-// func blockingPoll(fds *pollEvent, nfds int, timeout int64) (n int, err syscall.Errno)
-TEXT ·blockingPoll(SB),NOSPLIT,$0-40
+// func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (n int, err syscall.Errno)
+TEXT ·BlockingPoll(SB),NOSPLIT,$0-40
CALL ·callEntersyscallblock(SB)
MOVQ fds+0(FP), DI
MOVQ nfds+8(FP), SI
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go
index 93479cd0d..3ba96a123 100644
--- a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go
+++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go
@@ -25,7 +25,7 @@ import (
)
//go:noescape
-func blockingPoll(fds *pollEvent, nfds int, timeout int64) (int, syscall.Errno)
+func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno)
// Use go:linkname to call into the runtime. As of Go 1.12 this has to
// be done from Go code so that we make an ABIInternal call to an
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go
index 6a3e956ad..94ddad8ea 100644
--- a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go
+++ b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go
@@ -21,7 +21,9 @@ import (
"unsafe"
)
-func blockingPoll(fds *pollEvent, nfds int, timeout int64) (int, syscall.Errno) {
+// BlockingPoll is just a stub function that forwards to the poll() system call
+// on non-amd64 platforms.
+func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno) {
n, _, e := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout))
return int(n), e
}
diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
index 5deea093a..5d36ebe57 100644
--- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go
+++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
@@ -94,10 +94,11 @@ func NonBlockingWrite2(fd int, b1, b2 []byte) *tcpip.Error {
return nil
}
-type pollEvent struct {
- fd int32
- events int16
- revents int16
+// PollEvent represents the pollfd structure passed to a poll() system call.
+type PollEvent struct {
+ FD int32
+ Events int16
+ Revents int16
}
// BlockingRead reads from a file descriptor that is set up as non-blocking. If
@@ -110,12 +111,12 @@ func BlockingRead(fd int, b []byte) (int, *tcpip.Error) {
return int(n), nil
}
- event := pollEvent{
- fd: int32(fd),
- events: 1, // POLLIN
+ event := PollEvent{
+ FD: int32(fd),
+ Events: 1, // POLLIN
}
- _, e = blockingPoll(&event, 1, -1)
+ _, e = BlockingPoll(&event, 1, -1)
if e != 0 && e != syscall.EINTR {
return 0, TranslateErrno(e)
}
@@ -132,12 +133,12 @@ func BlockingReadv(fd int, iovecs []syscall.Iovec) (int, *tcpip.Error) {
return int(n), nil
}
- event := pollEvent{
- fd: int32(fd),
- events: 1, // POLLIN
+ event := PollEvent{
+ FD: int32(fd),
+ Events: 1, // POLLIN
}
- _, e = blockingPoll(&event, 1, -1)
+ _, e = BlockingPoll(&event, 1, -1)
if e != 0 && e != syscall.EINTR {
return 0, TranslateErrno(e)
}
@@ -162,12 +163,12 @@ func BlockingRecvMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) {
return int(n), nil
}
- event := pollEvent{
- fd: int32(fd),
- events: 1, // POLLIN
+ event := PollEvent{
+ FD: int32(fd),
+ Events: 1, // POLLIN
}
- if _, e := blockingPoll(&event, 1, -1); e != 0 && e != syscall.EINTR {
+ if _, e := BlockingPoll(&event, 1, -1); e != 0 && e != syscall.EINTR {
return 0, TranslateErrno(e)
}
}