summaryrefslogtreecommitdiffhomepage
path: root/pkg/tcpip/link/fdbased/endpoint.go
diff options
context:
space:
mode:
authorBhasker Hariharan <bhaskerh@google.com>2019-02-13 14:52:06 -0800
committerShentubot <shentubot@google.com>2019-02-13 14:53:03 -0800
commite0b3d3323fbb4b27280f0087427bb04c3e71238b (patch)
tree4ad81ee05445e0f11a9c1c6fe1227bf49ecbbbab /pkg/tcpip/link/fdbased/endpoint.go
parent0e84ae72e086c77cea066000a898b7bc951ba790 (diff)
Add support for using PACKET_RX_RING to receive packets.
PACKET_RX_RING allows the use of an mmapped buffer to receive packets from the kernel. This should cut down the number of host syscalls that need to be made to receive packets when the underlying fd is a socket of the AF_PACKET type. PiperOrigin-RevId: 233834998 Change-Id: I8060025c6ced206986e94cc46b8f382b81bfa47f
Diffstat (limited to 'pkg/tcpip/link/fdbased/endpoint.go')
-rw-r--r--pkg/tcpip/link/fdbased/endpoint.go100
1 files changed, 70 insertions, 30 deletions
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index 87c8ab1fc..20f379ab0 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -47,6 +47,30 @@ var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}
// NetworkDispatcher.
type linkDispatcher func() (bool, *tcpip.Error)
+// PacketDispatchMode identifies the method used to receive and dispatch
+// packets from the underlying FD.
+type PacketDispatchMode int
+
+const (
+ // Readv is the default dispatch mode and is the least performant of the
+ // dispatch options but the one that is supported by all underlying FD
+ // types.
+ Readv PacketDispatchMode = iota
+ // RecvMMsg enables use of recvmmsg() syscall instead of readv() to
+ // read inbound packets. This reduces # of syscalls needed to process
+ // packets.
+ //
+ // NOTE: recvmmsg() is only supported for sockets, so if the underlying
+ // FD is not a socket then the code will still fall back to the readv()
+ // path.
+ RecvMMsg
+ // PacketMMap enables use of PACKET_RX_RING to receive packets from the
+ // NIC. PacketMMap requires that the underlying FD be an AF_PACKET
+ // socket. The primary use-case for this is runsc, which uses an
+ // AF_PACKET FD to receive packets from the veth device.
+ PacketMMap
+)
+
type endpoint struct {
// fd is the file descriptor used to send and receive packets.
fd int
@@ -68,9 +92,11 @@ type endpoint struct {
// its end of the communication pipe.
closed func(*tcpip.Error)
- views [][]buffer.View
- iovecs [][]syscall.Iovec
- msgHdrs []rawfile.MMsgHdr
+ views [][]buffer.View
+ iovecs [][]syscall.Iovec
+ // msgHdrs is only used by the RecvMMsg dispatcher.
+ msgHdrs []rawfile.MMsgHdr
+
inboundDispatcher linkDispatcher
dispatcher stack.NetworkDispatcher
@@ -79,28 +105,31 @@ type endpoint struct {
// endpoint (false).
handleLocal bool
- // useRecvMMsg enables use of recvmmsg() syscall instead of readv() to
- // read inbound packets. This reduces # of syscalls needed to process
- // packets.
- //
- // NOTE: recvmmsg() is only supported for sockets, so if the underlying
- // FD is not a socket then the code will still fall back to the readv()
- // path.
- useRecvMMsg bool
+ // packetDispatchMode controls the packet dispatcher used by this
+ // endpoint.
+ packetDispatchMode PacketDispatchMode
+
+ // ringBuffer is only used when PacketMMap dispatcher is used and points
+ // to the start of the mmapped PACKET_RX_RING buffer.
+ ringBuffer []byte
+
+ // ringOffset is the current offset into the ring buffer where the next
+ // inbound packet will be placed by the kernel.
+ ringOffset int
}
// Options specify the details about the fd-based endpoint to be created.
type Options struct {
- FD int
- MTU uint32
- EthernetHeader bool
- ChecksumOffload bool
- ClosedFunc func(*tcpip.Error)
- Address tcpip.LinkAddress
- SaveRestore bool
- DisconnectOk bool
- HandleLocal bool
- UseRecvMMsg bool
+ FD int
+ MTU uint32
+ EthernetHeader bool
+ ChecksumOffload bool
+ ClosedFunc func(*tcpip.Error)
+ Address tcpip.LinkAddress
+ SaveRestore bool
+ DisconnectOk bool
+ HandleLocal bool
+ PacketDispatchMode PacketDispatchMode
}
// New creates a new fd-based endpoint.
@@ -133,21 +162,31 @@ func New(opts *Options) tcpip.LinkEndpointID {
}
e := &endpoint{
- fd: opts.FD,
- mtu: opts.MTU,
- caps: caps,
- closed: opts.ClosedFunc,
- addr: opts.Address,
- hdrSize: hdrSize,
- handleLocal: opts.HandleLocal,
- useRecvMMsg: opts.UseRecvMMsg,
+ fd: opts.FD,
+ mtu: opts.MTU,
+ caps: caps,
+ closed: opts.ClosedFunc,
+ addr: opts.Address,
+ hdrSize: hdrSize,
+ handleLocal: opts.HandleLocal,
+ packetDispatchMode: opts.PacketDispatchMode,
+ }
+
+ if isSocketFD(opts.FD) && e.packetDispatchMode == PacketMMap {
+ if err := e.setupPacketRXRing(); err != nil {
+ // TODO: replace panic with an error return.
+ panic(fmt.Sprintf("e.setupPacketRXRing failed: %v", err))
+ }
+ e.inboundDispatcher = e.packetMMapDispatch
+ return stack.RegisterLinkEndpoint(e)
}
+
// For non-socket FDs we read one packet at a time (e.g. TAP devices).
msgsPerRecv := 1
e.inboundDispatcher = e.dispatch
// If the provided FD is a socket and RecvMMsg dispatch is requested, then
// we batch packet reads by using recvmmsg() instead of readv().
- if isSocketFD(opts.FD) && e.useRecvMMsg {
+ if isSocketFD(opts.FD) && e.packetDispatchMode == RecvMMsg {
e.inboundDispatcher = e.recvMMsgDispatch
msgsPerRecv = MaxMsgsPerRecv
}
@@ -165,6 +204,7 @@ func New(opts *Options) tcpip.LinkEndpointID {
e.msgHdrs[i].Msg.Iov = &e.iovecs[i][0]
e.msgHdrs[i].Msg.Iovlen = uint64(len(BufConfig))
}
+
return stack.RegisterLinkEndpoint(e)
}