diff options
Diffstat (limited to 'pkg/tcpip/link/fdbased')
-rw-r--r-- | pkg/tcpip/link/fdbased/BUILD | 1 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/endpoint.go | 104 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/endpoint_test.go | 160 | ||||
-rw-r--r-- | pkg/tcpip/link/fdbased/endpoint_unsafe.go | 32 |
4 files changed, 228 insertions, 69 deletions
diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index bcf9c023e..50ce91a4e 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -6,6 +6,7 @@ go_library( name = "fdbased", srcs = [ "endpoint.go", + "endpoint_unsafe.go", "mmap.go", "mmap_amd64_unsafe.go", ], diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index d726551b0..20e34c5ee 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -111,6 +111,10 @@ type endpoint struct { // ringOffset is the current offset into the ring buffer where the next // inbound packet will be placed by the kernel. ringOffset int + + // gsoMaxSize is the maximum GSO packet size. It is zero if GSO is + // disabled. + gsoMaxSize uint32 } // Options specify the details about the fd-based endpoint to be created. @@ -123,6 +127,7 @@ type Options struct { Address tcpip.LinkAddress SaveRestore bool DisconnectOk bool + GSOMaxSize uint32 PacketDispatchMode PacketDispatchMode } @@ -165,6 +170,10 @@ func New(opts *Options) tcpip.LinkEndpointID { packetDispatchMode: opts.PacketDispatchMode, } + if opts.GSOMaxSize != 0 && isSocketFD(opts.FD) { + e.caps |= stack.CapabilityGSO + e.gsoMaxSize = opts.GSOMaxSize + } if isSocketFD(opts.FD) && e.packetDispatchMode == PacketMMap { if err := e.setupPacketRXRing(); err != nil { // TODO: replace panic with an error return. @@ -185,17 +194,22 @@ func New(opts *Options) tcpip.LinkEndpointID { } e.views = make([][]buffer.View, msgsPerRecv) - for i, _ := range e.views { + for i := range e.views { e.views[i] = make([]buffer.View, len(BufConfig)) } e.iovecs = make([][]syscall.Iovec, msgsPerRecv) - for i, _ := range e.iovecs { - e.iovecs[i] = make([]syscall.Iovec, len(BufConfig)) + iovLen := len(BufConfig) + if e.Capabilities()&stack.CapabilityGSO != 0 { + // virtioNetHdr is prepended before each packet. + iovLen++ + } + for i := range e.iovecs { + e.iovecs[i] = make([]syscall.Iovec, iovLen) } e.msgHdrs = make([]rawfile.MMsgHdr, msgsPerRecv) - for i, _ := range e.msgHdrs { + for i := range e.msgHdrs { e.msgHdrs[i].Msg.Iov = &e.iovecs[i][0] - e.msgHdrs[i].Msg.Iovlen = uint64(len(BufConfig)) + e.msgHdrs[i].Msg.Iovlen = uint64(iovLen) } return stack.RegisterLinkEndpoint(e) @@ -246,9 +260,27 @@ func (e *endpoint) LinkAddress() tcpip.LinkAddress { return e.addr } +// virtioNetHdr is declared in linux/virtio_net.h. +type virtioNetHdr struct { + flags uint8 + gsoType uint8 + hdrLen uint16 + gsoSize uint16 + csumStart uint16 + csumOffset uint16 +} + +// These constants are declared in linux/virtio_net.h. +const ( + _VIRTIO_NET_HDR_F_NEEDS_CSUM = 1 + + _VIRTIO_NET_HDR_GSO_TCPV4 = 1 + _VIRTIO_NET_HDR_GSO_TCPV6 = 4 +) + // WritePacket writes outbound packets to the file descriptor. If it is not // currently writable, the packet is dropped. -func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { if e.hdrSize > 0 { // Add ethernet header if needed. eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize)) @@ -266,11 +298,37 @@ func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload b eth.Encode(ethHdr) } + if e.Capabilities()&stack.CapabilityGSO != 0 { + vnetHdr := virtioNetHdr{} + vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr) + if gso != nil { + vnetHdr.hdrLen = uint16(hdr.UsedLength()) + if gso.NeedsCsum { + vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM + vnetHdr.csumStart = header.EthernetMinimumSize + gso.L3HdrLen + vnetHdr.csumOffset = gso.CsumOffset + } + if gso.Type != stack.GSONone && uint16(payload.Size()) > gso.MSS { + switch gso.Type { + case stack.GSOTCPv4: + vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 + case stack.GSOTCPv6: + vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 + default: + panic(fmt.Sprintf("Unknown gso type: %v", gso.Type)) + } + vnetHdr.gsoSize = gso.MSS + } + } + + return rawfile.NonBlockingWrite3(e.fd, vnetHdrBuf, hdr.View(), payload.ToView()) + } + if payload.Size() == 0 { return rawfile.NonBlockingWrite(e.fd, hdr.View()) } - return rawfile.NonBlockingWrite2(e.fd, hdr.View(), payload.ToView()) + return rawfile.NonBlockingWrite3(e.fd, hdr.View(), payload.ToView(), nil) } // WriteRawPacket writes a raw packet directly to the file descriptor. @@ -292,13 +350,25 @@ func (e *endpoint) capViews(k, n int, buffers []int) int { func (e *endpoint) allocateViews(bufConfig []int) { for k := 0; k < len(e.views); k++ { + var vnetHdr [virtioNetHdrSize]byte + vnetHdrOff := 0 + if e.Capabilities()&stack.CapabilityGSO != 0 { + // The kernel adds virtioNetHdr before each packet, but + // we don't use it, so so we allocate a buffer for it, + // add it in iovecs but don't add it in a view. + e.iovecs[k][0] = syscall.Iovec{ + Base: &vnetHdr[0], + Len: uint64(virtioNetHdrSize), + } + vnetHdrOff++ + } for i := 0; i < len(bufConfig); i++ { if e.views[k][i] != nil { break } b := buffer.NewView(bufConfig[i]) e.views[k][i] = b - e.iovecs[k][i] = syscall.Iovec{ + e.iovecs[k][i+vnetHdrOff] = syscall.Iovec{ Base: &b[0], Len: uint64(len(b)), } @@ -314,7 +384,11 @@ func (e *endpoint) dispatch() (bool, *tcpip.Error) { if err != nil { return false, err } - + if e.Capabilities()&stack.CapabilityGSO != 0 { + // Skip virtioNetHdr which is added before each packet, it + // isn't used and it isn't in a view. + n -= virtioNetHdrSize + } if n <= e.hdrSize { return false, nil } @@ -366,8 +440,11 @@ func (e *endpoint) recvMMsgDispatch() (bool, *tcpip.Error) { } // Process each of received packets. for k := 0; k < nMsgs; k++ { - n := e.msgHdrs[k].Len - if n <= uint32(e.hdrSize) { + n := int(e.msgHdrs[k].Len) + if e.Capabilities()&stack.CapabilityGSO != 0 { + n -= virtioNetHdrSize + } + if n <= e.hdrSize { return false, nil } @@ -425,6 +502,11 @@ func (e *endpoint) dispatchLoop() *tcpip.Error { } } +// GSOMaxSize returns the maximum GSO packet size. +func (e *endpoint) GSOMaxSize() uint32 { + return e.gsoMaxSize +} + // InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes // to the FD, but does not read from it. All reads come from injected packets. type InjectableEndpoint struct { diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index 14abacdf2..ecc5b73f3 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -24,6 +24,7 @@ import ( "syscall" "testing" "time" + "unsafe" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" @@ -33,10 +34,12 @@ import ( ) const ( - mtu = 1500 - laddr = tcpip.LinkAddress("\x11\x22\x33\x44\x55\x66") - raddr = tcpip.LinkAddress("\x77\x88\x99\xaa\xbb\xcc") - proto = 10 + mtu = 1500 + laddr = tcpip.LinkAddress("\x11\x22\x33\x44\x55\x66") + raddr = tcpip.LinkAddress("\x77\x88\x99\xaa\xbb\xcc") + proto = 10 + csumOffset = 48 + gsoMSS = 500 ) type packetInfo struct { @@ -130,67 +133,108 @@ func TestAddress(t *testing.T) { } } -func TestWritePacket(t *testing.T) { - lengths := []int{0, 100, 1000} - eths := []bool{true, false} +func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32) { + c := newContext(t, &Options{Address: laddr, MTU: mtu, EthernetHeader: eth, GSOMaxSize: gsoMaxSize}) + defer c.cleanup() - for _, eth := range eths { - for _, plen := range lengths { - t.Run(fmt.Sprintf("Eth=%v,PayloadLen=%v", eth, plen), func(t *testing.T) { - c := newContext(t, &Options{Address: laddr, MTU: mtu, EthernetHeader: eth}) - defer c.cleanup() + r := &stack.Route{ + RemoteLinkAddress: raddr, + } - r := &stack.Route{ - RemoteLinkAddress: raddr, - } + // Build header. + hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()) + 100) + b := hdr.Prepend(100) + for i := range b { + b[i] = uint8(rand.Intn(256)) + } - // Build header. - hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()) + 100) - b := hdr.Prepend(100) - for i := range b { - b[i] = uint8(rand.Intn(256)) - } + // Build payload and write. + payload := make(buffer.View, plen) + for i := range payload { + payload[i] = uint8(rand.Intn(256)) + } + want := append(hdr.View(), payload...) + var gso *stack.GSO + if gsoMaxSize != 0 { + gso = &stack.GSO{ + Type: stack.GSOTCPv6, + NeedsCsum: true, + CsumOffset: csumOffset, + MSS: gsoMSS, + MaxSize: gsoMaxSize, + } + } + if err := c.ep.WritePacket(r, gso, hdr, payload.ToVectorisedView(), proto); err != nil { + t.Fatalf("WritePacket failed: %v", err) + } - // Build payload and write. - payload := make(buffer.View, plen) - for i := range payload { - payload[i] = uint8(rand.Intn(256)) - } - want := append(hdr.View(), payload...) - if err := c.ep.WritePacket(r, hdr, payload.ToVectorisedView(), proto); err != nil { - t.Fatalf("WritePacket failed: %v", err) - } + // Read from fd, then compare with what we wrote. + b = make([]byte, mtu) + n, err := syscall.Read(c.fds[0], b) + if err != nil { + t.Fatalf("Read failed: %v", err) + } + b = b[:n] + if gsoMaxSize != 0 { + vnetHdr := *(*virtioNetHdr)(unsafe.Pointer(&b[0])) + if vnetHdr.flags&_VIRTIO_NET_HDR_F_NEEDS_CSUM == 0 { + t.Fatalf("virtioNetHdr.flags %v doesn't contain %v", vnetHdr.flags, _VIRTIO_NET_HDR_F_NEEDS_CSUM) + } + csumStart := header.EthernetMinimumSize + gso.L3HdrLen + if vnetHdr.csumStart != csumStart { + t.Fatalf("vnetHdr.csumStart = %v, want %v", vnetHdr.csumStart, csumStart) + } + if vnetHdr.csumOffset != csumOffset { + t.Fatalf("vnetHdr.csumOffset = %v, want %v", vnetHdr.csumOffset, csumOffset) + } + gsoType := uint8(0) + if int(gso.MSS) < plen { + gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 + } + if vnetHdr.gsoType != gsoType { + t.Fatalf("vnetHdr.gsoType = %v, want %v", vnetHdr.gsoType, gsoType) + } + b = b[virtioNetHdrSize:] + } + if eth { + h := header.Ethernet(b) + b = b[header.EthernetMinimumSize:] - // Read from fd, then compare with what we wrote. - b = make([]byte, mtu) - n, err := syscall.Read(c.fds[0], b) - if err != nil { - t.Fatalf("Read failed: %v", err) - } - b = b[:n] - if eth { - h := header.Ethernet(b) - b = b[header.EthernetMinimumSize:] + if a := h.SourceAddress(); a != laddr { + t.Fatalf("SourceAddress() = %v, want %v", a, laddr) + } - if a := h.SourceAddress(); a != laddr { - t.Fatalf("SourceAddress() = %v, want %v", a, laddr) - } + if a := h.DestinationAddress(); a != raddr { + t.Fatalf("DestinationAddress() = %v, want %v", a, raddr) + } - if a := h.DestinationAddress(); a != raddr { - t.Fatalf("DestinationAddress() = %v, want %v", a, raddr) - } + if et := h.Type(); et != proto { + t.Fatalf("Type() = %v, want %v", et, proto) + } + } + if len(b) != len(want) { + t.Fatalf("Read returned %v bytes, want %v", len(b), len(want)) + } + if !bytes.Equal(b, want) { + t.Fatalf("Read returned %x, want %x", b, want) + } +} - if et := h.Type(); et != proto { - t.Fatalf("Type() = %v, want %v", et, proto) - } - } - if len(b) != len(want) { - t.Fatalf("Read returned %v bytes, want %v", len(b), len(want)) - } - if !bytes.Equal(b, want) { - t.Fatalf("Read returned %x, want %x", b, want) - } - }) +func TestWritePacket(t *testing.T) { + lengths := []int{0, 100, 1000} + eths := []bool{true, false} + gsos := []uint32{0, 32768} + + for _, eth := range eths { + for _, plen := range lengths { + for _, gso := range gsos { + t.Run( + fmt.Sprintf("Eth=%v,PayloadLen=%v,GSOMaxSize=%v", eth, plen, gso), + func(t *testing.T) { + testWritePacket(t, plen, eth, gso) + }, + ) + } } } } @@ -210,7 +254,7 @@ func TestPreserveSrcAddress(t *testing.T) { // WritePacket panics given a prependable with anything less than // the minimum size of the ethernet header. hdr := buffer.NewPrependable(header.EthernetMinimumSize) - if err := c.ep.WritePacket(r, hdr, buffer.VectorisedView{}, proto); err != nil { + if err := c.ep.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, proto); err != nil { t.Fatalf("WritePacket failed: %v", err) } diff --git a/pkg/tcpip/link/fdbased/endpoint_unsafe.go b/pkg/tcpip/link/fdbased/endpoint_unsafe.go new file mode 100644 index 000000000..36e7fe5a9 --- /dev/null +++ b/pkg/tcpip/link/fdbased/endpoint_unsafe.go @@ -0,0 +1,32 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux + +package fdbased + +import ( + "reflect" + "unsafe" +) + +const virtioNetHdrSize = int(unsafe.Sizeof(virtioNetHdr{})) + +func vnetHdrToByteSlice(hdr *virtioNetHdr) (slice []byte) { + sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) + sh.Data = uintptr(unsafe.Pointer(hdr)) + sh.Len = virtioNetHdrSize + sh.Cap = virtioNetHdrSize + return +} |