diff options
author | Bhasker Hariharan <bhaskerh@google.com> | 2019-05-21 15:23:12 -0700 |
---|---|---|
committer | Shentubot <shentubot@google.com> | 2019-05-21 15:24:25 -0700 |
commit | 2ac0aeeb42ef517743a18224f9f6945c1c77d732 (patch) | |
tree | 3cc0942ae3948485013bc30dbb1404305b501209 /pkg/tcpip | |
parent | 9cdae51feca5cee9faa198161b92a0aeece52d6c (diff) |
Refactor fdbased endpoint dispatcher code.
This is in preparation to support an fdbased endpoint that can read/dispatch
packets from multiple underlying fds.
Updates #231
PiperOrigin-RevId: 249337074
Change-Id: Id7d375186cffcf55ae5e38986e7d605a96916d35
Diffstat (limited to 'pkg/tcpip')
-rw-r--r-- | pkg/tcpip/link/fdbased/BUILD | 2 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/endpoint.go | 239 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/endpoint_test.go | 78 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/mmap.go | 12 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/mmap_amd64.go | 194 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go | 174 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/packet_dispatchers.go | 309 |
7 files changed, 598 insertions, 410 deletions
diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index 50ce91a4e..cef98c353 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -8,7 +8,9 @@ go_library( "endpoint.go", "endpoint_unsafe.go", "mmap.go", + "mmap_amd64.go", "mmap_amd64_unsafe.go", + "packet_dispatchers.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased", visibility = [ diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 4da376774..1f889c2a0 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -34,18 +34,11 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" ) -const ( - // MaxMsgsPerRecv is the maximum number of packets we want to retrieve - // in a single RecvMMsg call. - MaxMsgsPerRecv = 8 -) - -// BufConfig defines the shape of the vectorised view used to read packets from the NIC. -var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768} - // linkDispatcher reads packets from the link FD and dispatches them to the // NetworkDispatcher. -type linkDispatcher func() (bool, *tcpip.Error) +type linkDispatcher interface { + dispatch() (bool, *tcpip.Error) +} // PacketDispatchMode are the various supported methods of receiving and // dispatching packets from the underlying FD. @@ -92,11 +85,6 @@ type endpoint struct { // its end of the communication pipe. closed func(*tcpip.Error) - views [][]buffer.View - iovecs [][]syscall.Iovec - // msgHdrs is only used by the RecvMMsg dispatcher. - msgHdrs []rawfile.MMsgHdr - inboundDispatcher linkDispatcher dispatcher stack.NetworkDispatcher @@ -104,14 +92,6 @@ type endpoint struct { // endpoint. packetDispatchMode PacketDispatchMode - // ringBuffer is only used when PacketMMap dispatcher is used and points - // to the start of the mmapped PACKET_RX_RING buffer. - ringBuffer []byte - - // ringOffset is the current offset into the ring buffer where the next - // inbound packet will be placed by the kernel. - ringOffset int - // gsoMaxSize is the maximum GSO packet size. It is zero if GSO is // disabled. gsoMaxSize uint32 @@ -174,11 +154,7 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) { packetDispatchMode: opts.PacketDispatchMode, } - // For non-socket FDs we read one packet a time (e.g. TAP devices). - msgsPerRecv := 1 - e.inboundDispatcher = e.dispatch - - isSocket, err := isSocketFD(opts.FD) + isSocket, err := isSocketFD(e.fd) if err != nil { return 0, err } @@ -187,44 +163,41 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) { e.caps |= stack.CapabilityGSO e.gsoMaxSize = opts.GSOMaxSize } + } + e.inboundDispatcher, err = createInboundDispatcher(e, isSocket) + if err != nil { + return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err) + } + + return stack.RegisterLinkEndpoint(e), nil +} +func createInboundDispatcher(e *endpoint, isSocket bool) (linkDispatcher, error) { + // By default use the readv() dispatcher as it works with all kinds of + // FDs (tap/tun/unix domain sockets and af_packet). + inboundDispatcher, err := newReadVDispatcher(e.fd, e) + if err != nil { + return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", e.fd, e, err) + } + + if isSocket { switch e.packetDispatchMode { case PacketMMap: - if err := e.setupPacketRXRing(); err != nil { - return 0, fmt.Errorf("e.setupPacketRXRing failed: %v", err) + inboundDispatcher, err = newPacketMMapDispatcher(e.fd, e) + if err != nil { + return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", e.fd, e, err) } - e.inboundDispatcher = e.packetMMapDispatch - return stack.RegisterLinkEndpoint(e), nil - case RecvMMsg: // If the provided FD is a socket then we optimize // packet reads by using recvmmsg() instead of read() to // read packets in a batch. - e.inboundDispatcher = e.recvMMsgDispatch - msgsPerRecv = MaxMsgsPerRecv + inboundDispatcher, err = newRecvMMsgDispatcher(e.fd, e) + if err != nil { + return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", e.fd, e, err) + } } } - - e.views = make([][]buffer.View, msgsPerRecv) - for i := range e.views { - e.views[i] = make([]buffer.View, len(BufConfig)) - } - e.iovecs = make([][]syscall.Iovec, msgsPerRecv) - iovLen := len(BufConfig) - if e.Capabilities()&stack.CapabilityGSO != 0 { - // virtioNetHdr is prepended before each packet. - iovLen++ - } - for i := range e.iovecs { - e.iovecs[i] = make([]syscall.Iovec, iovLen) - } - e.msgHdrs = make([]rawfile.MMsgHdr, msgsPerRecv) - for i := range e.msgHdrs { - e.msgHdrs[i].Msg.Iov = &e.iovecs[i][0] - e.msgHdrs[i].Msg.Iovlen = uint64(iovLen) - } - - return stack.RegisterLinkEndpoint(e), nil + return inboundDispatcher, nil } func isSocketFD(fd int) (bool, error) { @@ -347,163 +320,11 @@ func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Erro return rawfile.NonBlockingWrite(e.fd, packet) } -func (e *endpoint) capViews(k, n int, buffers []int) int { - c := 0 - for i, s := range buffers { - c += s - if c >= n { - e.views[k][i].CapLength(s - (c - n)) - return i + 1 - } - } - return len(buffers) -} - -func (e *endpoint) allocateViews(bufConfig []int) { - for k := 0; k < len(e.views); k++ { - var vnetHdr [virtioNetHdrSize]byte - vnetHdrOff := 0 - if e.Capabilities()&stack.CapabilityGSO != 0 { - // The kernel adds virtioNetHdr before each packet, but - // we don't use it, so so we allocate a buffer for it, - // add it in iovecs but don't add it in a view. - e.iovecs[k][0] = syscall.Iovec{ - Base: &vnetHdr[0], - Len: uint64(virtioNetHdrSize), - } - vnetHdrOff++ - } - for i := 0; i < len(bufConfig); i++ { - if e.views[k][i] != nil { - break - } - b := buffer.NewView(bufConfig[i]) - e.views[k][i] = b - e.iovecs[k][i+vnetHdrOff] = syscall.Iovec{ - Base: &b[0], - Len: uint64(len(b)), - } - } - } -} - -// dispatch reads one packet from the file descriptor and dispatches it. -func (e *endpoint) dispatch() (bool, *tcpip.Error) { - e.allocateViews(BufConfig) - - n, err := rawfile.BlockingReadv(e.fd, e.iovecs[0]) - if err != nil { - return false, err - } - if e.Capabilities()&stack.CapabilityGSO != 0 { - // Skip virtioNetHdr which is added before each packet, it - // isn't used and it isn't in a view. - n -= virtioNetHdrSize - } - if n <= e.hdrSize { - return false, nil - } - - var ( - p tcpip.NetworkProtocolNumber - remote, local tcpip.LinkAddress - ) - if e.hdrSize > 0 { - eth := header.Ethernet(e.views[0][0]) - p = eth.Type() - remote = eth.SourceAddress() - local = eth.DestinationAddress() - } else { - // We don't get any indication of what the packet is, so try to guess - // if it's an IPv4 or IPv6 packet. - switch header.IPVersion(e.views[0][0]) { - case header.IPv4Version: - p = header.IPv4ProtocolNumber - case header.IPv6Version: - p = header.IPv6ProtocolNumber - default: - return true, nil - } - } - - used := e.capViews(0, n, BufConfig) - vv := buffer.NewVectorisedView(n, e.views[0][:used]) - vv.TrimFront(e.hdrSize) - - e.dispatcher.DeliverNetworkPacket(e, remote, local, p, vv) - - // Prepare e.views for another packet: release used views. - for i := 0; i < used; i++ { - e.views[0][i] = nil - } - - return true, nil -} - -// recvMMsgDispatch reads more than one packet at a time from the file -// descriptor and dispatches it. -func (e *endpoint) recvMMsgDispatch() (bool, *tcpip.Error) { - e.allocateViews(BufConfig) - - nMsgs, err := rawfile.BlockingRecvMMsg(e.fd, e.msgHdrs) - if err != nil { - return false, err - } - // Process each of received packets. - for k := 0; k < nMsgs; k++ { - n := int(e.msgHdrs[k].Len) - if e.Capabilities()&stack.CapabilityGSO != 0 { - n -= virtioNetHdrSize - } - if n <= e.hdrSize { - return false, nil - } - - var ( - p tcpip.NetworkProtocolNumber - remote, local tcpip.LinkAddress - ) - if e.hdrSize > 0 { - eth := header.Ethernet(e.views[k][0]) - p = eth.Type() - remote = eth.SourceAddress() - local = eth.DestinationAddress() - } else { - // We don't get any indication of what the packet is, so try to guess - // if it's an IPv4 or IPv6 packet. - switch header.IPVersion(e.views[k][0]) { - case header.IPv4Version: - p = header.IPv4ProtocolNumber - case header.IPv6Version: - p = header.IPv6ProtocolNumber - default: - return true, nil - } - } - - used := e.capViews(k, int(n), BufConfig) - vv := buffer.NewVectorisedView(int(n), e.views[k][:used]) - vv.TrimFront(e.hdrSize) - e.dispatcher.DeliverNetworkPacket(e, remote, local, p, vv) - - // Prepare e.views for another packet: release used views. - for i := 0; i < used; i++ { - e.views[k][i] = nil - } - } - - for k := 0; k < nMsgs; k++ { - e.msgHdrs[k].Len = 0 - } - - return true, nil -} - // dispatchLoop reads packets from the file descriptor in a loop and dispatches // them to the network stack. func (e *endpoint) dispatchLoop() *tcpip.Error { for { - cont, err := e.inboundDispatcher() + cont, err := e.inboundDispatcher.dispatch() if err != nil || !cont { if e.closed != nil { e.closed(err) diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index 31138e4ac..fd1722074 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -357,28 +357,6 @@ func TestBufConfigFirst(t *testing.T) { } } -func build(bufConfig []int) *endpoint { - e := &endpoint{ - views: make([][]buffer.View, MaxMsgsPerRecv), - iovecs: make([][]syscall.Iovec, MaxMsgsPerRecv), - msgHdrs: make([]rawfile.MMsgHdr, MaxMsgsPerRecv), - } - - for i, _ := range e.views { - e.views[i] = make([]buffer.View, len(bufConfig)) - } - for i := range e.iovecs { - e.iovecs[i] = make([]syscall.Iovec, len(bufConfig)) - } - for k, msgHdr := range e.msgHdrs { - msgHdr.Msg.Iov = &e.iovecs[k][0] - msgHdr.Msg.Iovlen = uint64(len(bufConfig)) - } - - e.allocateViews(bufConfig) - return e -} - var capLengthTestCases = []struct { comment string config []int @@ -416,19 +394,61 @@ var capLengthTestCases = []struct { }, } -func TestCapLength(t *testing.T) { +func TestReadVDispatcherCapLength(t *testing.T) { + for _, c := range capLengthTestCases { + // fd does not matter for this test. + d := readVDispatcher{fd: -1, e: &endpoint{}} + d.views = make([]buffer.View, len(c.config)) + d.iovecs = make([]syscall.Iovec, len(c.config)) + d.allocateViews(c.config) + + used := d.capViews(c.n, c.config) + if used != c.wantUsed { + t.Errorf("Test %q failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed) + } + lengths := make([]int, len(d.views)) + for i, v := range d.views { + lengths[i] = len(v) + } + if !reflect.DeepEqual(lengths, c.wantLengths) { + t.Errorf("Test %q failed when calling capViews(%d, %v). Got %v. Want %v", c.comment, c.n, c.config, lengths, c.wantLengths) + } + } +} + +func TestRecvMMsgDispatcherCapLength(t *testing.T) { for _, c := range capLengthTestCases { - e := build(c.config) - used := e.capViews(0, c.n, c.config) + d := recvMMsgDispatcher{ + fd: -1, // fd does not matter for this test. + e: &endpoint{}, + views: make([][]buffer.View, 1), + iovecs: make([][]syscall.Iovec, 1), + msgHdrs: make([]rawfile.MMsgHdr, 1), + } + + for i, _ := range d.views { + d.views[i] = make([]buffer.View, len(c.config)) + } + for i := range d.iovecs { + d.iovecs[i] = make([]syscall.Iovec, len(c.config)) + } + for k, msgHdr := range d.msgHdrs { + msgHdr.Msg.Iov = &d.iovecs[k][0] + msgHdr.Msg.Iovlen = uint64(len(c.config)) + } + + d.allocateViews(c.config) + + used := d.capViews(0, c.n, c.config) if used != c.wantUsed { - t.Errorf("Test \"%s\" failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed) + t.Errorf("Test %q failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed) } - lengths := make([]int, len(e.views[0])) - for i, v := range e.views[0] { + lengths := make([]int, len(d.views[0])) + for i, v := range d.views[0] { lengths[i] = len(v) } if !reflect.DeepEqual(lengths, c.wantLengths) { - t.Errorf("Test \"%s\" failed when calling capViews(%d, %v). Got %v. Want %v", c.comment, c.n, c.config, lengths, c.wantLengths) + t.Errorf("Test %q failed when calling capViews(%d, %v). Got %v. Want %v", c.comment, c.n, c.config, lengths, c.wantLengths) } } diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go index 430c85a42..6b7f2a185 100644 --- a/pkg/tcpip/link/fdbased/mmap.go +++ b/pkg/tcpip/link/fdbased/mmap.go @@ -18,16 +18,8 @@ package fdbased import "gvisor.googlesource.com/gvisor/pkg/tcpip" -// Stubbed out versions for non-linux/non-amd64 platforms. +// Stubbed out version for non-linux/non-amd64 platforms. -func (e *endpoint) setupPacketRXRing() error { - return nil -} - -func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) { +func newPacketMMapDispatcher(fd int, e *endpoint) (linkDispatcher, *tcpip.Error) { return nil, nil } - -func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) { - return false, nil -} diff --git a/pkg/tcpip/link/fdbased/mmap_amd64.go b/pkg/tcpip/link/fdbased/mmap_amd64.go new file mode 100644 index 000000000..1c2d8c468 --- /dev/null +++ b/pkg/tcpip/link/fdbased/mmap_amd64.go @@ -0,0 +1,194 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux,amd64 + +package fdbased + +import ( + "encoding/binary" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" +) + +const ( + tPacketAlignment = uintptr(16) + tpStatusKernel = 0 + tpStatusUser = 1 + tpStatusCopy = 2 + tpStatusLosing = 4 +) + +// We overallocate the frame size to accommodate space for the +// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding. +// +// Memory allocated for the ring buffer: tpBlockSize * tpBlockNR = 2 MiB +// +// NOTE: +// Frames need to be aligned at 16 byte boundaries. +// BlockSize needs to be page aligned. +// +// For details see PACKET_MMAP setting constraints in +// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt +const ( + tpFrameSize = 65536 + 128 + tpBlockSize = tpFrameSize * 32 + tpBlockNR = 1 + tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize +) + +// tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct +// translation of the TPACKET_ALIGN macro in <linux/if_packet.h>. +func tPacketAlign(v uintptr) uintptr { + return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1)) +} + +// tPacketReq is the tpacket_req structure as described in +// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt +type tPacketReq struct { + tpBlockSize uint32 + tpBlockNR uint32 + tpFrameSize uint32 + tpFrameNR uint32 +} + +// tPacketHdr is tpacket_hdr structure as described in <linux/if_packet.h> +type tPacketHdr []byte + +const ( + tpStatusOffset = 0 + tpLenOffset = 8 + tpSnapLenOffset = 12 + tpMacOffset = 16 + tpNetOffset = 18 + tpSecOffset = 20 + tpUSecOffset = 24 +) + +func (t tPacketHdr) tpLen() uint32 { + return binary.LittleEndian.Uint32(t[tpLenOffset:]) +} + +func (t tPacketHdr) tpSnapLen() uint32 { + return binary.LittleEndian.Uint32(t[tpSnapLenOffset:]) +} + +func (t tPacketHdr) tpMac() uint16 { + return binary.LittleEndian.Uint16(t[tpMacOffset:]) +} + +func (t tPacketHdr) tpNet() uint16 { + return binary.LittleEndian.Uint16(t[tpNetOffset:]) +} + +func (t tPacketHdr) tpSec() uint32 { + return binary.LittleEndian.Uint32(t[tpSecOffset:]) +} + +func (t tPacketHdr) tpUSec() uint32 { + return binary.LittleEndian.Uint32(t[tpUSecOffset:]) +} + +func (t tPacketHdr) Payload() []byte { + return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()] +} + +// packetMMapDispatcher uses PACKET_RX_RING's to read/dispatch inbound packets. +// See: mmap_amd64_unsafe.go for implementation details. +type packetMMapDispatcher struct { + // fd is the file descriptor used to send and receive packets. + fd int + + // e is the endpoint this dispatcher is attached to. + e *endpoint + + // ringBuffer is only used when PacketMMap dispatcher is used and points + // to the start of the mmapped PACKET_RX_RING buffer. + ringBuffer []byte + + // ringOffset is the current offset into the ring buffer where the next + // inbound packet will be placed by the kernel. + ringOffset int +} + +func (d *packetMMapDispatcher) readMMappedPacket() ([]byte, *tcpip.Error) { + hdr := tPacketHdr(d.ringBuffer[d.ringOffset*tpFrameSize:]) + for hdr.tpStatus()&tpStatusUser == 0 { + event := rawfile.PollEvent{ + FD: int32(d.fd), + Events: unix.POLLIN | unix.POLLERR, + } + if _, errno := rawfile.BlockingPoll(&event, 1, -1); errno != 0 { + if errno == syscall.EINTR { + continue + } + return nil, rawfile.TranslateErrno(errno) + } + if hdr.tpStatus()&tpStatusCopy != 0 { + // This frame is truncated so skip it after flipping the + // buffer to the kernel. + hdr.setTPStatus(tpStatusKernel) + d.ringOffset = (d.ringOffset + 1) % tpFrameNR + hdr = (tPacketHdr)(d.ringBuffer[d.ringOffset*tpFrameSize:]) + continue + } + } + + // Copy out the packet from the mmapped frame to a locally owned buffer. + pkt := make([]byte, hdr.tpSnapLen()) + copy(pkt, hdr.Payload()) + // Release packet to kernel. + hdr.setTPStatus(tpStatusKernel) + d.ringOffset = (d.ringOffset + 1) % tpFrameNR + return pkt, nil +} + +// dispatch reads packets from an mmaped ring buffer and dispatches them to the +// network stack. +func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) { + pkt, err := d.readMMappedPacket() + if err != nil { + return false, err + } + var ( + p tcpip.NetworkProtocolNumber + remote, local tcpip.LinkAddress + ) + if d.e.hdrSize > 0 { + eth := header.Ethernet(pkt) + p = eth.Type() + remote = eth.SourceAddress() + local = eth.DestinationAddress() + } else { + // We don't get any indication of what the packet is, so try to guess + // if it's an IPv4 or IPv6 packet. + switch header.IPVersion(pkt) { + case header.IPv4Version: + p = header.IPv4ProtocolNumber + case header.IPv6Version: + p = header.IPv6ProtocolNumber + default: + return true, nil + } + } + + pkt = pkt[d.e.hdrSize:] + d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)})) + return true, nil +} diff --git a/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go index 135da2498..47cb1d1cc 100644 --- a/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go +++ b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go @@ -17,76 +17,17 @@ package fdbased import ( - "encoding/binary" "fmt" "sync/atomic" "syscall" "unsafe" "golang.org/x/sys/unix" - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/header" - "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" ) -const ( - tPacketAlignment = uintptr(16) - tpStatusKernel = 0 - tpStatusUser = 1 - tpStatusCopy = 2 - tpStatusLosing = 4 -) - -// We overallocate the frame size to accommodate space for the -// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding. -// -// Memory allocated for the ring buffer: tpBlockSize * tpBlockNR = 2 MiB -// -// NOTE: -// Frames need to be aligned at 16 byte boundaries. -// BlockSize needs to be page aligned. -// -// For details see PACKET_MMAP setting constraints in -// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt -const ( - tpFrameSize = 65536 + 128 - tpBlockSize = tpFrameSize * 32 - tpBlockNR = 1 - tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize -) - -// tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct -// translation of the TPACKET_ALIGN macro in <linux/if_packet.h>. -func tPacketAlign(v uintptr) uintptr { - return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1)) -} - // tPacketHdrlen is the TPACKET_HDRLEN variable defined in <linux/if_packet.h>. var tPacketHdrlen = tPacketAlign(unsafe.Sizeof(tPacketHdr{}) + unsafe.Sizeof(syscall.RawSockaddrLinklayer{})) -// tPacketReq is the tpacket_req structure as described in -// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt -type tPacketReq struct { - tpBlockSize uint32 - tpBlockNR uint32 - tpFrameSize uint32 - tpFrameNR uint32 -} - -// tPacketHdr is tpacket_hdr structure as described in <linux/if_packet.h> -type tPacketHdr []byte - -const ( - tpStatusOffset = 0 - tpLenOffset = 8 - tpSnapLenOffset = 12 - tpMacOffset = 16 - tpNetOffset = 18 - tpSecOffset = 20 - tpUSecOffset = 24 -) - // tpStatus returns the frame status field. // The status is concurrently updated by the kernel as a result we must // use atomic operations to prevent races. @@ -105,38 +46,14 @@ func (t tPacketHdr) setTPStatus(status uint32) { atomic.StoreUint32((*uint32)(statusPtr), status) } -func (t tPacketHdr) tpLen() uint32 { - return binary.LittleEndian.Uint32(t[tpLenOffset:]) -} - -func (t tPacketHdr) tpSnapLen() uint32 { - return binary.LittleEndian.Uint32(t[tpSnapLenOffset:]) -} - -func (t tPacketHdr) tpMac() uint16 { - return binary.LittleEndian.Uint16(t[tpMacOffset:]) -} - -func (t tPacketHdr) tpNet() uint16 { - return binary.LittleEndian.Uint16(t[tpNetOffset:]) -} - -func (t tPacketHdr) tpSec() uint32 { - return binary.LittleEndian.Uint32(t[tpSecOffset:]) -} - -func (t tPacketHdr) tpUSec() uint32 { - return binary.LittleEndian.Uint32(t[tpUSecOffset:]) -} - -func (t tPacketHdr) Payload() []byte { - return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()] -} - -func (e *endpoint) setupPacketRXRing() error { +func newPacketMMapDispatcher(fd int, e *endpoint) (linkDispatcher, error) { + d := &packetMMapDispatcher{ + fd: fd, + e: e, + } pageSize := unix.Getpagesize() if tpBlockSize%pageSize != 0 { - return fmt.Errorf("tpBlockSize: %d is not page aligned, pagesize: %d", tpBlockSize, pageSize) + return nil, fmt.Errorf("tpBlockSize: %d is not page aligned, pagesize: %d", tpBlockSize, pageSize) } tReq := tPacketReq{ tpBlockSize: uint32(tpBlockSize), @@ -145,84 +62,17 @@ func (e *endpoint) setupPacketRXRing() error { tpFrameNR: uint32(tpFrameNR), } // Setup PACKET_RX_RING. - if err := setsockopt(e.fd, syscall.SOL_PACKET, syscall.PACKET_RX_RING, unsafe.Pointer(&tReq), unsafe.Sizeof(tReq)); err != nil { - return fmt.Errorf("failed to enable PACKET_RX_RING: %v", err) + if err := setsockopt(d.fd, syscall.SOL_PACKET, syscall.PACKET_RX_RING, unsafe.Pointer(&tReq), unsafe.Sizeof(tReq)); err != nil { + return nil, fmt.Errorf("failed to enable PACKET_RX_RING: %v", err) } // Let's mmap the blocks. sz := tpBlockSize * tpBlockNR - buf, err := syscall.Mmap(e.fd, 0, sz, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) - if err != nil { - return fmt.Errorf("syscall.Mmap(...,0, %v, ...) failed = %v", sz, err) - } - e.ringBuffer = buf - return nil -} - -func (e *endpoint) readMMappedPacket() ([]byte, *tcpip.Error) { - hdr := (tPacketHdr)(e.ringBuffer[e.ringOffset*tpFrameSize:]) - for hdr.tpStatus()&tpStatusUser == 0 { - event := rawfile.PollEvent{ - FD: int32(e.fd), - Events: unix.POLLIN | unix.POLLERR, - } - _, errno := rawfile.BlockingPoll(&event, 1, -1) - if errno != 0 { - if errno == syscall.EINTR { - continue - } - return nil, rawfile.TranslateErrno(errno) - } - if hdr.tpStatus()&tpStatusCopy != 0 { - // This frame is truncated so skip it after flipping the - // buffer to the kernel. - hdr.setTPStatus(tpStatusKernel) - e.ringOffset = (e.ringOffset + 1) % tpFrameNR - hdr = (tPacketHdr)(e.ringBuffer[e.ringOffset*tpFrameSize:]) - continue - } - } - - // Copy out the packet from the mmapped frame to a locally owned buffer. - pkt := make([]byte, hdr.tpSnapLen()) - copy(pkt, hdr.Payload()) - // Release packet to kernel. - hdr.setTPStatus(tpStatusKernel) - e.ringOffset = (e.ringOffset + 1) % tpFrameNR - return pkt, nil -} - -// packetMMapDispatch reads packets from an mmaped ring buffer and dispatches -// them to the network stack. -func (e *endpoint) packetMMapDispatch() (bool, *tcpip.Error) { - pkt, err := e.readMMappedPacket() + buf, err := syscall.Mmap(d.fd, 0, sz, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) if err != nil { - return false, err + return nil, fmt.Errorf("syscall.Mmap(...,0, %v, ...) failed = %v", sz, err) } - var ( - p tcpip.NetworkProtocolNumber - remote, local tcpip.LinkAddress - ) - if e.hdrSize > 0 { - eth := header.Ethernet(pkt) - p = eth.Type() - remote = eth.SourceAddress() - local = eth.DestinationAddress() - } else { - // We don't get any indication of what the packet is, so try to guess - // if it's an IPv4 or IPv6 packet. - switch header.IPVersion(pkt) { - case header.IPv4Version: - p = header.IPv4ProtocolNumber - case header.IPv6Version: - p = header.IPv6ProtocolNumber - default: - return true, nil - } - } - - pkt = pkt[e.hdrSize:] - e.dispatcher.DeliverNetworkPacket(e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)})) - return true, nil + d.ringBuffer = buf + return d, nil } func setsockopt(fd, level, name int, val unsafe.Pointer, vallen uintptr) error { diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go new file mode 100644 index 000000000..1ae0e3359 --- /dev/null +++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go @@ -0,0 +1,309 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux + +package fdbased + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" +) + +// BufConfig defines the shape of the vectorised view used to read packets from the NIC. +var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768} + +// readVDispatcher uses readv() system call to read inbound packets and +// dispatches them. +type readVDispatcher struct { + // fd is the file descriptor used to send and receive packets. + fd int + + // e is the endpoint this dispatcher is attached to. + e *endpoint + + // views are the actual buffers that hold the packet contents. + views []buffer.View + + // iovecs are initialized with base pointers/len of the corresponding + // entries in the views defined above, except when GSO is enabled then + // the first iovec points to a buffer for the vnet header which is + // stripped before the views are passed up the stack for further + // processing. + iovecs []syscall.Iovec +} + +func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) { + d := &readVDispatcher{fd: fd, e: e} + d.views = make([]buffer.View, len(BufConfig)) + iovLen := len(BufConfig) + if d.e.Capabilities()&stack.CapabilityGSO != 0 { + iovLen++ + } + d.iovecs = make([]syscall.Iovec, iovLen) + return d, nil +} + +func (d *readVDispatcher) allocateViews(bufConfig []int) { + var vnetHdr [virtioNetHdrSize]byte + vnetHdrOff := 0 + if d.e.Capabilities()&stack.CapabilityGSO != 0 { + // The kernel adds virtioNetHdr before each packet, but + // we don't use it, so so we allocate a buffer for it, + // add it in iovecs but don't add it in a view. + d.iovecs[0] = syscall.Iovec{ + Base: &vnetHdr[0], + Len: uint64(virtioNetHdrSize), + } + vnetHdrOff++ + } + for i := 0; i < len(bufConfig); i++ { + if d.views[i] != nil { + break + } + b := buffer.NewView(bufConfig[i]) + d.views[i] = b + d.iovecs[i+vnetHdrOff] = syscall.Iovec{ + Base: &b[0], + Len: uint64(len(b)), + } + } +} + +func (d *readVDispatcher) capViews(n int, buffers []int) int { + c := 0 + for i, s := range buffers { + c += s + if c >= n { + d.views[i].CapLength(s - (c - n)) + return i + 1 + } + } + return len(buffers) +} + +// dispatch reads one packet from the file descriptor and dispatches it. +func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { + d.allocateViews(BufConfig) + + n, err := rawfile.BlockingReadv(d.fd, d.iovecs) + if err != nil { + return false, err + } + if d.e.Capabilities()&stack.CapabilityGSO != 0 { + // Skip virtioNetHdr which is added before each packet, it + // isn't used and it isn't in a view. + n -= virtioNetHdrSize + } + if n <= d.e.hdrSize { + return false, nil + } + + var ( + p tcpip.NetworkProtocolNumber + remote, local tcpip.LinkAddress + ) + if d.e.hdrSize > 0 { + eth := header.Ethernet(d.views[0]) + p = eth.Type() + remote = eth.SourceAddress() + local = eth.DestinationAddress() + } else { + // We don't get any indication of what the packet is, so try to guess + // if it's an IPv4 or IPv6 packet. + switch header.IPVersion(d.views[0]) { + case header.IPv4Version: + p = header.IPv4ProtocolNumber + case header.IPv6Version: + p = header.IPv6ProtocolNumber + default: + return true, nil + } + } + + used := d.capViews(n, BufConfig) + vv := buffer.NewVectorisedView(n, d.views[:used]) + vv.TrimFront(d.e.hdrSize) + + d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv) + + // Prepare e.views for another packet: release used views. + for i := 0; i < used; i++ { + d.views[i] = nil + } + + return true, nil +} + +// recvMMsgDispatcher uses the recvmmsg system call to read inbound packets and +// dispatches them. +type recvMMsgDispatcher struct { + // fd is the file descriptor used to send and receive packets. + fd int + + // e is the endpoint this dispatcher is attached to. + e *endpoint + + // views is an array of array of buffers that contain packet contents. + views [][]buffer.View + + // iovecs is an array of array of iovec records where each iovec base + // pointer and length are initialzed to the corresponding view above, + // except when GSO is neabled then the first iovec in each array of + // iovecs points to a buffer for the vnet header which is stripped + // before the views are passed up the stack for further processing. + iovecs [][]syscall.Iovec + + // msgHdrs is an array of MMsgHdr objects where each MMsghdr is used to + // reference an array of iovecs in the iovecs field defined above. This + // array is passed as the parameter to recvmmsg call to retrieve + // potentially more than 1 packet per syscall. + msgHdrs []rawfile.MMsgHdr +} + +const ( + // MaxMsgsPerRecv is the maximum number of packets we want to retrieve + // in a single RecvMMsg call. + MaxMsgsPerRecv = 8 +) + +func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) { + d := &recvMMsgDispatcher{ + fd: fd, + e: e, + } + d.views = make([][]buffer.View, MaxMsgsPerRecv) + for i := range d.views { + d.views[i] = make([]buffer.View, len(BufConfig)) + } + d.iovecs = make([][]syscall.Iovec, MaxMsgsPerRecv) + iovLen := len(BufConfig) + if d.e.Capabilities()&stack.CapabilityGSO != 0 { + // virtioNetHdr is prepended before each packet. + iovLen++ + } + for i := range d.iovecs { + d.iovecs[i] = make([]syscall.Iovec, iovLen) + } + d.msgHdrs = make([]rawfile.MMsgHdr, MaxMsgsPerRecv) + for i := range d.msgHdrs { + d.msgHdrs[i].Msg.Iov = &d.iovecs[i][0] + d.msgHdrs[i].Msg.Iovlen = uint64(iovLen) + } + return d, nil +} + +func (d *recvMMsgDispatcher) capViews(k, n int, buffers []int) int { + c := 0 + for i, s := range buffers { + c += s + if c >= n { + d.views[k][i].CapLength(s - (c - n)) + return i + 1 + } + } + return len(buffers) +} + +func (d *recvMMsgDispatcher) allocateViews(bufConfig []int) { + for k := 0; k < len(d.views); k++ { + var vnetHdr [virtioNetHdrSize]byte + vnetHdrOff := 0 + if d.e.Capabilities()&stack.CapabilityGSO != 0 { + // The kernel adds virtioNetHdr before each packet, but + // we don't use it, so so we allocate a buffer for it, + // add it in iovecs but don't add it in a view. + d.iovecs[k][0] = syscall.Iovec{ + Base: &vnetHdr[0], + Len: uint64(virtioNetHdrSize), + } + vnetHdrOff++ + } + for i := 0; i < len(bufConfig); i++ { + if d.views[k][i] != nil { + break + } + b := buffer.NewView(bufConfig[i]) + d.views[k][i] = b + d.iovecs[k][i+vnetHdrOff] = syscall.Iovec{ + Base: &b[0], + Len: uint64(len(b)), + } + } + } +} + +// recvMMsgDispatch reads more than one packet at a time from the file +// descriptor and dispatches it. +func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { + d.allocateViews(BufConfig) + + nMsgs, err := rawfile.BlockingRecvMMsg(d.fd, d.msgHdrs) + if err != nil { + return false, err + } + // Process each of received packets. + for k := 0; k < nMsgs; k++ { + n := int(d.msgHdrs[k].Len) + if d.e.Capabilities()&stack.CapabilityGSO != 0 { + n -= virtioNetHdrSize + } + if n <= d.e.hdrSize { + return false, nil + } + + var ( + p tcpip.NetworkProtocolNumber + remote, local tcpip.LinkAddress + ) + if d.e.hdrSize > 0 { + eth := header.Ethernet(d.views[k][0]) + p = eth.Type() + remote = eth.SourceAddress() + local = eth.DestinationAddress() + } else { + // We don't get any indication of what the packet is, so try to guess + // if it's an IPv4 or IPv6 packet. + switch header.IPVersion(d.views[k][0]) { + case header.IPv4Version: + p = header.IPv4ProtocolNumber + case header.IPv6Version: + p = header.IPv6ProtocolNumber + default: + return true, nil + } + } + + used := d.capViews(k, int(n), BufConfig) + vv := buffer.NewVectorisedView(int(n), d.views[k][:used]) + vv.TrimFront(d.e.hdrSize) + d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv) + + // Prepare e.views for another packet: release used views. + for i := 0; i < used; i++ { + d.views[k][i] = nil + } + } + + for k := 0; k < nMsgs; k++ { + d.msgHdrs[k].Len = 0 + } + + return true, nil +} |