diff options
author | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000 |
commit | ceb0d792f328d1fc0692197d8856a43c3936a571 (patch) | |
tree | 83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/tcpip/link/fdbased | |
parent | deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff) | |
parent | 216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff) |
Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/tcpip/link/fdbased')
-rw-r--r-- | pkg/tcpip/link/fdbased/endpoint.go | 372 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/endpoint_unsafe.go | 32 | ||||
-rwxr-xr-x | pkg/tcpip/link/fdbased/fdbased_state_autogen.go | 4 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/mmap.go | 25 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/mmap_amd64.go | 194 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go | 84 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/packet_dispatchers.go | 309 |
7 files changed, 1020 insertions, 0 deletions
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go new file mode 100644 index 000000000..1f889c2a0 --- /dev/null +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -0,0 +1,372 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux + +// Package fdbased provides the implemention of data-link layer endpoints +// backed by boundary-preserving file descriptors (e.g., TUN devices, +// seqpacket/datagram sockets). +// +// FD based endpoints can be used in the networking stack by calling New() to +// create a new endpoint, and then passing it as an argument to +// Stack.CreateNIC(). +package fdbased + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" +) + +// linkDispatcher reads packets from the link FD and dispatches them to the +// NetworkDispatcher. +type linkDispatcher interface { + dispatch() (bool, *tcpip.Error) +} + +// PacketDispatchMode are the various supported methods of receiving and +// dispatching packets from the underlying FD. +type PacketDispatchMode int + +const ( + // Readv is the default dispatch mode and is the least performant of the + // dispatch options but the one that is supported by all underlying FD + // types. + Readv PacketDispatchMode = iota + // RecvMMsg enables use of recvmmsg() syscall instead of readv() to + // read inbound packets. This reduces # of syscalls needed to process + // packets. + // + // NOTE: recvmmsg() is only supported for sockets, so if the underlying + // FD is not a socket then the code will still fall back to the readv() + // path. + RecvMMsg + // PacketMMap enables use of PACKET_RX_RING to receive packets from the + // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The + // primary use-case for this is runsc which uses an AF_PACKET FD to + // receive packets from the veth device. + PacketMMap +) + +type endpoint struct { + // fd is the file descriptor used to send and receive packets. + fd int + + // mtu (maximum transmission unit) is the maximum size of a packet. + mtu uint32 + + // hdrSize specifies the link-layer header size. If set to 0, no header + // is added/removed; otherwise an ethernet header is used. + hdrSize int + + // addr is the address of the endpoint. + addr tcpip.LinkAddress + + // caps holds the endpoint capabilities. + caps stack.LinkEndpointCapabilities + + // closed is a function to be called when the FD's peer (if any) closes + // its end of the communication pipe. + closed func(*tcpip.Error) + + inboundDispatcher linkDispatcher + dispatcher stack.NetworkDispatcher + + // packetDispatchMode controls the packet dispatcher used by this + // endpoint. + packetDispatchMode PacketDispatchMode + + // gsoMaxSize is the maximum GSO packet size. It is zero if GSO is + // disabled. + gsoMaxSize uint32 +} + +// Options specify the details about the fd-based endpoint to be created. +type Options struct { + FD int + MTU uint32 + EthernetHeader bool + ClosedFunc func(*tcpip.Error) + Address tcpip.LinkAddress + SaveRestore bool + DisconnectOk bool + GSOMaxSize uint32 + PacketDispatchMode PacketDispatchMode + TXChecksumOffload bool + RXChecksumOffload bool +} + +// New creates a new fd-based endpoint. +// +// Makes fd non-blocking, but does not take ownership of fd, which must remain +// open for the lifetime of the returned endpoint. +func New(opts *Options) (tcpip.LinkEndpointID, error) { + if err := syscall.SetNonblock(opts.FD, true); err != nil { + return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", opts.FD, err) + } + + caps := stack.LinkEndpointCapabilities(0) + if opts.RXChecksumOffload { + caps |= stack.CapabilityRXChecksumOffload + } + + if opts.TXChecksumOffload { + caps |= stack.CapabilityTXChecksumOffload + } + + hdrSize := 0 + if opts.EthernetHeader { + hdrSize = header.EthernetMinimumSize + caps |= stack.CapabilityResolutionRequired + } + + if opts.SaveRestore { + caps |= stack.CapabilitySaveRestore + } + + if opts.DisconnectOk { + caps |= stack.CapabilityDisconnectOk + } + + e := &endpoint{ + fd: opts.FD, + mtu: opts.MTU, + caps: caps, + closed: opts.ClosedFunc, + addr: opts.Address, + hdrSize: hdrSize, + packetDispatchMode: opts.PacketDispatchMode, + } + + isSocket, err := isSocketFD(e.fd) + if err != nil { + return 0, err + } + if isSocket { + if opts.GSOMaxSize != 0 { + e.caps |= stack.CapabilityGSO + e.gsoMaxSize = opts.GSOMaxSize + } + } + e.inboundDispatcher, err = createInboundDispatcher(e, isSocket) + if err != nil { + return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err) + } + + return stack.RegisterLinkEndpoint(e), nil +} + +func createInboundDispatcher(e *endpoint, isSocket bool) (linkDispatcher, error) { + // By default use the readv() dispatcher as it works with all kinds of + // FDs (tap/tun/unix domain sockets and af_packet). + inboundDispatcher, err := newReadVDispatcher(e.fd, e) + if err != nil { + return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", e.fd, e, err) + } + + if isSocket { + switch e.packetDispatchMode { + case PacketMMap: + inboundDispatcher, err = newPacketMMapDispatcher(e.fd, e) + if err != nil { + return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", e.fd, e, err) + } + case RecvMMsg: + // If the provided FD is a socket then we optimize + // packet reads by using recvmmsg() instead of read() to + // read packets in a batch. + inboundDispatcher, err = newRecvMMsgDispatcher(e.fd, e) + if err != nil { + return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", e.fd, e, err) + } + } + } + return inboundDispatcher, nil +} + +func isSocketFD(fd int) (bool, error) { + var stat syscall.Stat_t + if err := syscall.Fstat(fd, &stat); err != nil { + return false, fmt.Errorf("syscall.Fstat(%v,...) failed: %v", fd, err) + } + return (stat.Mode & syscall.S_IFSOCK) == syscall.S_IFSOCK, nil +} + +// Attach launches the goroutine that reads packets from the file descriptor and +// dispatches them via the provided dispatcher. +func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { + e.dispatcher = dispatcher + // Link endpoints are not savable. When transportation endpoints are + // saved, they stop sending outgoing packets and all incoming packets + // are rejected. + go e.dispatchLoop() // S/R-SAFE: See above. +} + +// IsAttached implements stack.LinkEndpoint.IsAttached. +func (e *endpoint) IsAttached() bool { + return e.dispatcher != nil +} + +// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized +// during construction. +func (e *endpoint) MTU() uint32 { + return e.mtu +} + +// Capabilities implements stack.LinkEndpoint.Capabilities. +func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { + return e.caps +} + +// MaxHeaderLength returns the maximum size of the link-layer header. +func (e *endpoint) MaxHeaderLength() uint16 { + return uint16(e.hdrSize) +} + +// LinkAddress returns the link address of this endpoint. +func (e *endpoint) LinkAddress() tcpip.LinkAddress { + return e.addr +} + +// virtioNetHdr is declared in linux/virtio_net.h. +type virtioNetHdr struct { + flags uint8 + gsoType uint8 + hdrLen uint16 + gsoSize uint16 + csumStart uint16 + csumOffset uint16 +} + +// These constants are declared in linux/virtio_net.h. +const ( + _VIRTIO_NET_HDR_F_NEEDS_CSUM = 1 + + _VIRTIO_NET_HDR_GSO_TCPV4 = 1 + _VIRTIO_NET_HDR_GSO_TCPV6 = 4 +) + +// WritePacket writes outbound packets to the file descriptor. If it is not +// currently writable, the packet is dropped. +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { + if e.hdrSize > 0 { + // Add ethernet header if needed. + eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize)) + ethHdr := &header.EthernetFields{ + DstAddr: r.RemoteLinkAddress, + Type: protocol, + } + + // Preserve the src address if it's set in the route. + if r.LocalLinkAddress != "" { + ethHdr.SrcAddr = r.LocalLinkAddress + } else { + ethHdr.SrcAddr = e.addr + } + eth.Encode(ethHdr) + } + + if e.Capabilities()&stack.CapabilityGSO != 0 { + vnetHdr := virtioNetHdr{} + vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr) + if gso != nil { + vnetHdr.hdrLen = uint16(hdr.UsedLength()) + if gso.NeedsCsum { + vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM + vnetHdr.csumStart = header.EthernetMinimumSize + gso.L3HdrLen + vnetHdr.csumOffset = gso.CsumOffset + } + if gso.Type != stack.GSONone && uint16(payload.Size()) > gso.MSS { + switch gso.Type { + case stack.GSOTCPv4: + vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 + case stack.GSOTCPv6: + vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 + default: + panic(fmt.Sprintf("Unknown gso type: %v", gso.Type)) + } + vnetHdr.gsoSize = gso.MSS + } + } + + return rawfile.NonBlockingWrite3(e.fd, vnetHdrBuf, hdr.View(), payload.ToView()) + } + + if payload.Size() == 0 { + return rawfile.NonBlockingWrite(e.fd, hdr.View()) + } + + return rawfile.NonBlockingWrite3(e.fd, hdr.View(), payload.ToView(), nil) +} + +// WriteRawPacket writes a raw packet directly to the file descriptor. +func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error { + return rawfile.NonBlockingWrite(e.fd, packet) +} + +// dispatchLoop reads packets from the file descriptor in a loop and dispatches +// them to the network stack. +func (e *endpoint) dispatchLoop() *tcpip.Error { + for { + cont, err := e.inboundDispatcher.dispatch() + if err != nil || !cont { + if e.closed != nil { + e.closed(err) + } + return err + } + } +} + +// GSOMaxSize returns the maximum GSO packet size. +func (e *endpoint) GSOMaxSize() uint32 { + return e.gsoMaxSize +} + +// InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes +// to the FD, but does not read from it. All reads come from injected packets. +type InjectableEndpoint struct { + endpoint + + dispatcher stack.NetworkDispatcher +} + +// Attach saves the stack network-layer dispatcher for use later when packets +// are injected. +func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { + e.dispatcher = dispatcher +} + +// Inject injects an inbound packet. +func (e *InjectableEndpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { + e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv) +} + +// NewInjectable creates a new fd-based InjectableEndpoint. +func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) (tcpip.LinkEndpointID, *InjectableEndpoint) { + syscall.SetNonblock(fd, true) + + e := &InjectableEndpoint{endpoint: endpoint{ + fd: fd, + mtu: mtu, + caps: capabilities, + }} + + return stack.RegisterLinkEndpoint(e), e +} diff --git a/pkg/tcpip/link/fdbased/endpoint_unsafe.go b/pkg/tcpip/link/fdbased/endpoint_unsafe.go new file mode 100644 index 000000000..97a477b61 --- /dev/null +++ b/pkg/tcpip/link/fdbased/endpoint_unsafe.go @@ -0,0 +1,32 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux + +package fdbased + +import ( + "reflect" + "unsafe" +) + +const virtioNetHdrSize = int(unsafe.Sizeof(virtioNetHdr{})) + +func vnetHdrToByteSlice(hdr *virtioNetHdr) (slice []byte) { + sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) + sh.Data = uintptr(unsafe.Pointer(hdr)) + sh.Len = virtioNetHdrSize + sh.Cap = virtioNetHdrSize + return +} diff --git a/pkg/tcpip/link/fdbased/fdbased_state_autogen.go b/pkg/tcpip/link/fdbased/fdbased_state_autogen.go new file mode 100755 index 000000000..0555db528 --- /dev/null +++ b/pkg/tcpip/link/fdbased/fdbased_state_autogen.go @@ -0,0 +1,4 @@ +// automatically generated by stateify. + +package fdbased + diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go new file mode 100644 index 000000000..6b7f2a185 --- /dev/null +++ b/pkg/tcpip/link/fdbased/mmap.go @@ -0,0 +1,25 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !linux !amd64 + +package fdbased + +import "gvisor.googlesource.com/gvisor/pkg/tcpip" + +// Stubbed out version for non-linux/non-amd64 platforms. + +func newPacketMMapDispatcher(fd int, e *endpoint) (linkDispatcher, *tcpip.Error) { + return nil, nil +} diff --git a/pkg/tcpip/link/fdbased/mmap_amd64.go b/pkg/tcpip/link/fdbased/mmap_amd64.go new file mode 100644 index 000000000..1c2d8c468 --- /dev/null +++ b/pkg/tcpip/link/fdbased/mmap_amd64.go @@ -0,0 +1,194 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux,amd64 + +package fdbased + +import ( + "encoding/binary" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" +) + +const ( + tPacketAlignment = uintptr(16) + tpStatusKernel = 0 + tpStatusUser = 1 + tpStatusCopy = 2 + tpStatusLosing = 4 +) + +// We overallocate the frame size to accommodate space for the +// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding. +// +// Memory allocated for the ring buffer: tpBlockSize * tpBlockNR = 2 MiB +// +// NOTE: +// Frames need to be aligned at 16 byte boundaries. +// BlockSize needs to be page aligned. +// +// For details see PACKET_MMAP setting constraints in +// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt +const ( + tpFrameSize = 65536 + 128 + tpBlockSize = tpFrameSize * 32 + tpBlockNR = 1 + tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize +) + +// tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct +// translation of the TPACKET_ALIGN macro in <linux/if_packet.h>. +func tPacketAlign(v uintptr) uintptr { + return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1)) +} + +// tPacketReq is the tpacket_req structure as described in +// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt +type tPacketReq struct { + tpBlockSize uint32 + tpBlockNR uint32 + tpFrameSize uint32 + tpFrameNR uint32 +} + +// tPacketHdr is tpacket_hdr structure as described in <linux/if_packet.h> +type tPacketHdr []byte + +const ( + tpStatusOffset = 0 + tpLenOffset = 8 + tpSnapLenOffset = 12 + tpMacOffset = 16 + tpNetOffset = 18 + tpSecOffset = 20 + tpUSecOffset = 24 +) + +func (t tPacketHdr) tpLen() uint32 { + return binary.LittleEndian.Uint32(t[tpLenOffset:]) +} + +func (t tPacketHdr) tpSnapLen() uint32 { + return binary.LittleEndian.Uint32(t[tpSnapLenOffset:]) +} + +func (t tPacketHdr) tpMac() uint16 { + return binary.LittleEndian.Uint16(t[tpMacOffset:]) +} + +func (t tPacketHdr) tpNet() uint16 { + return binary.LittleEndian.Uint16(t[tpNetOffset:]) +} + +func (t tPacketHdr) tpSec() uint32 { + return binary.LittleEndian.Uint32(t[tpSecOffset:]) +} + +func (t tPacketHdr) tpUSec() uint32 { + return binary.LittleEndian.Uint32(t[tpUSecOffset:]) +} + +func (t tPacketHdr) Payload() []byte { + return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()] +} + +// packetMMapDispatcher uses PACKET_RX_RING's to read/dispatch inbound packets. +// See: mmap_amd64_unsafe.go for implementation details. +type packetMMapDispatcher struct { + // fd is the file descriptor used to send and receive packets. + fd int + + // e is the endpoint this dispatcher is attached to. + e *endpoint + + // ringBuffer is only used when PacketMMap dispatcher is used and points + // to the start of the mmapped PACKET_RX_RING buffer. + ringBuffer []byte + + // ringOffset is the current offset into the ring buffer where the next + // inbound packet will be placed by the kernel. + ringOffset int +} + +func (d *packetMMapDispatcher) readMMappedPacket() ([]byte, *tcpip.Error) { + hdr := tPacketHdr(d.ringBuffer[d.ringOffset*tpFrameSize:]) + for hdr.tpStatus()&tpStatusUser == 0 { + event := rawfile.PollEvent{ + FD: int32(d.fd), + Events: unix.POLLIN | unix.POLLERR, + } + if _, errno := rawfile.BlockingPoll(&event, 1, -1); errno != 0 { + if errno == syscall.EINTR { + continue + } + return nil, rawfile.TranslateErrno(errno) + } + if hdr.tpStatus()&tpStatusCopy != 0 { + // This frame is truncated so skip it after flipping the + // buffer to the kernel. + hdr.setTPStatus(tpStatusKernel) + d.ringOffset = (d.ringOffset + 1) % tpFrameNR + hdr = (tPacketHdr)(d.ringBuffer[d.ringOffset*tpFrameSize:]) + continue + } + } + + // Copy out the packet from the mmapped frame to a locally owned buffer. + pkt := make([]byte, hdr.tpSnapLen()) + copy(pkt, hdr.Payload()) + // Release packet to kernel. + hdr.setTPStatus(tpStatusKernel) + d.ringOffset = (d.ringOffset + 1) % tpFrameNR + return pkt, nil +} + +// dispatch reads packets from an mmaped ring buffer and dispatches them to the +// network stack. +func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) { + pkt, err := d.readMMappedPacket() + if err != nil { + return false, err + } + var ( + p tcpip.NetworkProtocolNumber + remote, local tcpip.LinkAddress + ) + if d.e.hdrSize > 0 { + eth := header.Ethernet(pkt) + p = eth.Type() + remote = eth.SourceAddress() + local = eth.DestinationAddress() + } else { + // We don't get any indication of what the packet is, so try to guess + // if it's an IPv4 or IPv6 packet. + switch header.IPVersion(pkt) { + case header.IPv4Version: + p = header.IPv4ProtocolNumber + case header.IPv6Version: + p = header.IPv6ProtocolNumber + default: + return true, nil + } + } + + pkt = pkt[d.e.hdrSize:] + d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, buffer.NewVectorisedView(len(pkt), []buffer.View{buffer.View(pkt)})) + return true, nil +} diff --git a/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go new file mode 100644 index 000000000..47cb1d1cc --- /dev/null +++ b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go @@ -0,0 +1,84 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux,amd64 + +package fdbased + +import ( + "fmt" + "sync/atomic" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" +) + +// tPacketHdrlen is the TPACKET_HDRLEN variable defined in <linux/if_packet.h>. +var tPacketHdrlen = tPacketAlign(unsafe.Sizeof(tPacketHdr{}) + unsafe.Sizeof(syscall.RawSockaddrLinklayer{})) + +// tpStatus returns the frame status field. +// The status is concurrently updated by the kernel as a result we must +// use atomic operations to prevent races. +func (t tPacketHdr) tpStatus() uint32 { + hdr := unsafe.Pointer(&t[0]) + statusPtr := unsafe.Pointer(uintptr(hdr) + uintptr(tpStatusOffset)) + return atomic.LoadUint32((*uint32)(statusPtr)) +} + +// setTPStatus set's the frame status to the provided status. +// The status is concurrently updated by the kernel as a result we must +// use atomic operations to prevent races. +func (t tPacketHdr) setTPStatus(status uint32) { + hdr := unsafe.Pointer(&t[0]) + statusPtr := unsafe.Pointer(uintptr(hdr) + uintptr(tpStatusOffset)) + atomic.StoreUint32((*uint32)(statusPtr), status) +} + +func newPacketMMapDispatcher(fd int, e *endpoint) (linkDispatcher, error) { + d := &packetMMapDispatcher{ + fd: fd, + e: e, + } + pageSize := unix.Getpagesize() + if tpBlockSize%pageSize != 0 { + return nil, fmt.Errorf("tpBlockSize: %d is not page aligned, pagesize: %d", tpBlockSize, pageSize) + } + tReq := tPacketReq{ + tpBlockSize: uint32(tpBlockSize), + tpBlockNR: uint32(tpBlockNR), + tpFrameSize: uint32(tpFrameSize), + tpFrameNR: uint32(tpFrameNR), + } + // Setup PACKET_RX_RING. + if err := setsockopt(d.fd, syscall.SOL_PACKET, syscall.PACKET_RX_RING, unsafe.Pointer(&tReq), unsafe.Sizeof(tReq)); err != nil { + return nil, fmt.Errorf("failed to enable PACKET_RX_RING: %v", err) + } + // Let's mmap the blocks. + sz := tpBlockSize * tpBlockNR + buf, err := syscall.Mmap(d.fd, 0, sz, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) + if err != nil { + return nil, fmt.Errorf("syscall.Mmap(...,0, %v, ...) failed = %v", sz, err) + } + d.ringBuffer = buf + return d, nil +} + +func setsockopt(fd, level, name int, val unsafe.Pointer, vallen uintptr) error { + if _, _, errno := syscall.Syscall6(syscall.SYS_SETSOCKOPT, uintptr(fd), uintptr(level), uintptr(name), uintptr(val), vallen, 0); errno != 0 { + return error(errno) + } + + return nil +} diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go new file mode 100644 index 000000000..1ae0e3359 --- /dev/null +++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go @@ -0,0 +1,309 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux + +package fdbased + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" +) + +// BufConfig defines the shape of the vectorised view used to read packets from the NIC. +var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768} + +// readVDispatcher uses readv() system call to read inbound packets and +// dispatches them. +type readVDispatcher struct { + // fd is the file descriptor used to send and receive packets. + fd int + + // e is the endpoint this dispatcher is attached to. + e *endpoint + + // views are the actual buffers that hold the packet contents. + views []buffer.View + + // iovecs are initialized with base pointers/len of the corresponding + // entries in the views defined above, except when GSO is enabled then + // the first iovec points to a buffer for the vnet header which is + // stripped before the views are passed up the stack for further + // processing. + iovecs []syscall.Iovec +} + +func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) { + d := &readVDispatcher{fd: fd, e: e} + d.views = make([]buffer.View, len(BufConfig)) + iovLen := len(BufConfig) + if d.e.Capabilities()&stack.CapabilityGSO != 0 { + iovLen++ + } + d.iovecs = make([]syscall.Iovec, iovLen) + return d, nil +} + +func (d *readVDispatcher) allocateViews(bufConfig []int) { + var vnetHdr [virtioNetHdrSize]byte + vnetHdrOff := 0 + if d.e.Capabilities()&stack.CapabilityGSO != 0 { + // The kernel adds virtioNetHdr before each packet, but + // we don't use it, so so we allocate a buffer for it, + // add it in iovecs but don't add it in a view. + d.iovecs[0] = syscall.Iovec{ + Base: &vnetHdr[0], + Len: uint64(virtioNetHdrSize), + } + vnetHdrOff++ + } + for i := 0; i < len(bufConfig); i++ { + if d.views[i] != nil { + break + } + b := buffer.NewView(bufConfig[i]) + d.views[i] = b + d.iovecs[i+vnetHdrOff] = syscall.Iovec{ + Base: &b[0], + Len: uint64(len(b)), + } + } +} + +func (d *readVDispatcher) capViews(n int, buffers []int) int { + c := 0 + for i, s := range buffers { + c += s + if c >= n { + d.views[i].CapLength(s - (c - n)) + return i + 1 + } + } + return len(buffers) +} + +// dispatch reads one packet from the file descriptor and dispatches it. +func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { + d.allocateViews(BufConfig) + + n, err := rawfile.BlockingReadv(d.fd, d.iovecs) + if err != nil { + return false, err + } + if d.e.Capabilities()&stack.CapabilityGSO != 0 { + // Skip virtioNetHdr which is added before each packet, it + // isn't used and it isn't in a view. + n -= virtioNetHdrSize + } + if n <= d.e.hdrSize { + return false, nil + } + + var ( + p tcpip.NetworkProtocolNumber + remote, local tcpip.LinkAddress + ) + if d.e.hdrSize > 0 { + eth := header.Ethernet(d.views[0]) + p = eth.Type() + remote = eth.SourceAddress() + local = eth.DestinationAddress() + } else { + // We don't get any indication of what the packet is, so try to guess + // if it's an IPv4 or IPv6 packet. + switch header.IPVersion(d.views[0]) { + case header.IPv4Version: + p = header.IPv4ProtocolNumber + case header.IPv6Version: + p = header.IPv6ProtocolNumber + default: + return true, nil + } + } + + used := d.capViews(n, BufConfig) + vv := buffer.NewVectorisedView(n, d.views[:used]) + vv.TrimFront(d.e.hdrSize) + + d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv) + + // Prepare e.views for another packet: release used views. + for i := 0; i < used; i++ { + d.views[i] = nil + } + + return true, nil +} + +// recvMMsgDispatcher uses the recvmmsg system call to read inbound packets and +// dispatches them. +type recvMMsgDispatcher struct { + // fd is the file descriptor used to send and receive packets. + fd int + + // e is the endpoint this dispatcher is attached to. + e *endpoint + + // views is an array of array of buffers that contain packet contents. + views [][]buffer.View + + // iovecs is an array of array of iovec records where each iovec base + // pointer and length are initialzed to the corresponding view above, + // except when GSO is neabled then the first iovec in each array of + // iovecs points to a buffer for the vnet header which is stripped + // before the views are passed up the stack for further processing. + iovecs [][]syscall.Iovec + + // msgHdrs is an array of MMsgHdr objects where each MMsghdr is used to + // reference an array of iovecs in the iovecs field defined above. This + // array is passed as the parameter to recvmmsg call to retrieve + // potentially more than 1 packet per syscall. + msgHdrs []rawfile.MMsgHdr +} + +const ( + // MaxMsgsPerRecv is the maximum number of packets we want to retrieve + // in a single RecvMMsg call. + MaxMsgsPerRecv = 8 +) + +func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) { + d := &recvMMsgDispatcher{ + fd: fd, + e: e, + } + d.views = make([][]buffer.View, MaxMsgsPerRecv) + for i := range d.views { + d.views[i] = make([]buffer.View, len(BufConfig)) + } + d.iovecs = make([][]syscall.Iovec, MaxMsgsPerRecv) + iovLen := len(BufConfig) + if d.e.Capabilities()&stack.CapabilityGSO != 0 { + // virtioNetHdr is prepended before each packet. + iovLen++ + } + for i := range d.iovecs { + d.iovecs[i] = make([]syscall.Iovec, iovLen) + } + d.msgHdrs = make([]rawfile.MMsgHdr, MaxMsgsPerRecv) + for i := range d.msgHdrs { + d.msgHdrs[i].Msg.Iov = &d.iovecs[i][0] + d.msgHdrs[i].Msg.Iovlen = uint64(iovLen) + } + return d, nil +} + +func (d *recvMMsgDispatcher) capViews(k, n int, buffers []int) int { + c := 0 + for i, s := range buffers { + c += s + if c >= n { + d.views[k][i].CapLength(s - (c - n)) + return i + 1 + } + } + return len(buffers) +} + +func (d *recvMMsgDispatcher) allocateViews(bufConfig []int) { + for k := 0; k < len(d.views); k++ { + var vnetHdr [virtioNetHdrSize]byte + vnetHdrOff := 0 + if d.e.Capabilities()&stack.CapabilityGSO != 0 { + // The kernel adds virtioNetHdr before each packet, but + // we don't use it, so so we allocate a buffer for it, + // add it in iovecs but don't add it in a view. + d.iovecs[k][0] = syscall.Iovec{ + Base: &vnetHdr[0], + Len: uint64(virtioNetHdrSize), + } + vnetHdrOff++ + } + for i := 0; i < len(bufConfig); i++ { + if d.views[k][i] != nil { + break + } + b := buffer.NewView(bufConfig[i]) + d.views[k][i] = b + d.iovecs[k][i+vnetHdrOff] = syscall.Iovec{ + Base: &b[0], + Len: uint64(len(b)), + } + } + } +} + +// recvMMsgDispatch reads more than one packet at a time from the file +// descriptor and dispatches it. +func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { + d.allocateViews(BufConfig) + + nMsgs, err := rawfile.BlockingRecvMMsg(d.fd, d.msgHdrs) + if err != nil { + return false, err + } + // Process each of received packets. + for k := 0; k < nMsgs; k++ { + n := int(d.msgHdrs[k].Len) + if d.e.Capabilities()&stack.CapabilityGSO != 0 { + n -= virtioNetHdrSize + } + if n <= d.e.hdrSize { + return false, nil + } + + var ( + p tcpip.NetworkProtocolNumber + remote, local tcpip.LinkAddress + ) + if d.e.hdrSize > 0 { + eth := header.Ethernet(d.views[k][0]) + p = eth.Type() + remote = eth.SourceAddress() + local = eth.DestinationAddress() + } else { + // We don't get any indication of what the packet is, so try to guess + // if it's an IPv4 or IPv6 packet. + switch header.IPVersion(d.views[k][0]) { + case header.IPv4Version: + p = header.IPv4ProtocolNumber + case header.IPv6Version: + p = header.IPv6ProtocolNumber + default: + return true, nil + } + } + + used := d.capViews(k, int(n), BufConfig) + vv := buffer.NewVectorisedView(int(n), d.views[k][:used]) + vv.TrimFront(d.e.hdrSize) + d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, vv) + + // Prepare e.views for another packet: release used views. + for i := 0; i < used; i++ { + d.views[k][i] = nil + } + } + + for k := 0; k < nMsgs; k++ { + d.msgHdrs[k].Len = 0 + } + + return true, nil +} |