From 8720bd643ef37a693984baba83c37458f6691df1 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 22 Oct 2019 11:54:14 -0700 Subject: netstack/tcp: software segmentation offload Right now, we send each tcp packet separately, we call one system call per-packet. This patch allows to generate multiple tcp packets and send them by sendmmsg. The arguable part of this CL is a way how to handle multiple headers. This CL adds the next field to the Prepandable buffer. Nginx test results: Server Software: nginx/1.15.9 Server Hostname: 10.138.0.2 Server Port: 8080 Document Path: /10m.txt Document Length: 10485760 bytes w/o gso: Concurrency Level: 5 Time taken for tests: 5.491 seconds Complete requests: 100 Failed requests: 0 Total transferred: 1048600200 bytes HTML transferred: 1048576000 bytes Requests per second: 18.21 [#/sec] (mean) Time per request: 274.525 [ms] (mean) Time per request: 54.905 [ms] (mean, across all concurrent requests) Transfer rate: 186508.03 [Kbytes/sec] received sw-gso: Concurrency Level: 5 Time taken for tests: 3.852 seconds Complete requests: 100 Failed requests: 0 Total transferred: 1048600200 bytes HTML transferred: 1048576000 bytes Requests per second: 25.96 [#/sec] (mean) Time per request: 192.576 [ms] (mean) Time per request: 38.515 [ms] (mean, across all concurrent requests) Transfer rate: 265874.92 [Kbytes/sec] received w/o gso: $ ./tcp_benchmark --client --duration 15 --ideal [SUM] 0.0-15.1 sec 2.20 GBytes 1.25 Gbits/sec software gso: $ tcp_benchmark --client --duration 15 --ideal --gso $((1<<16)) --swgso [SUM] 0.0-15.1 sec 3.99 GBytes 2.26 Gbits/sec PiperOrigin-RevId: 276112677 --- kokoro/swgso_tests.cfg | 9 ++ pkg/tcpip/buffer/prependable.go | 5 ++ pkg/tcpip/header/BUILD | 6 +- pkg/tcpip/header/checksum.go | 50 ++++++++--- pkg/tcpip/header/checksum_test.go | 109 +++++++++++++++++++++++ pkg/tcpip/link/channel/channel.go | 29 +++++- pkg/tcpip/link/fdbased/endpoint.go | 128 ++++++++++++++++++++++++++- pkg/tcpip/link/fdbased/packet_dispatchers.go | 12 +-- pkg/tcpip/link/loopback/loopback.go | 5 ++ pkg/tcpip/link/muxed/injectable.go | 15 +++- pkg/tcpip/link/rawfile/BUILD | 5 +- pkg/tcpip/link/rawfile/rawfile_unsafe.go | 11 +++ pkg/tcpip/link/sharedmem/sharedmem.go | 5 ++ pkg/tcpip/link/sniffer/sniffer.go | 23 ++++- pkg/tcpip/link/waitable/waitable.go | 13 +++ pkg/tcpip/link/waitable/waitable_test.go | 6 ++ pkg/tcpip/network/arp/arp.go | 5 ++ pkg/tcpip/network/ip_test.go | 5 ++ pkg/tcpip/network/ipv4/ipv4.go | 27 +++++- pkg/tcpip/network/ipv6/ipv6.go | 30 ++++++- pkg/tcpip/stack/registration.go | 28 +++++- pkg/tcpip/stack/route.go | 48 ++++++++-- pkg/tcpip/stack/stack_test.go | 5 ++ pkg/tcpip/transport/tcp/BUILD | 1 + pkg/tcpip/transport/tcp/connect.go | 77 +++++++++++++--- pkg/tcpip/transport/tcp/endpoint.go | 18 ++-- runsc/boot/config.go | 9 +- runsc/boot/filter/config.go | 9 ++ runsc/boot/network.go | 14 +-- runsc/main.go | 6 +- runsc/sandbox/BUILD | 1 + runsc/sandbox/network.go | 12 ++- scripts/swgso_tests.sh | 21 +++++ 33 files changed, 676 insertions(+), 71 deletions(-) create mode 100644 kokoro/swgso_tests.cfg create mode 100644 pkg/tcpip/header/checksum_test.go create mode 100755 scripts/swgso_tests.sh diff --git a/kokoro/swgso_tests.cfg b/kokoro/swgso_tests.cfg new file mode 100644 index 000000000..101a9c607 --- /dev/null +++ b/kokoro/swgso_tests.cfg @@ -0,0 +1,9 @@ +build_file: "repo/scripts/swgso_tests.sh" + +action { + define_artifacts { + regex: "**/sponge_log.xml" + regex: "**/sponge_log.log" + regex: "**/outputs.zip" + } +} diff --git a/pkg/tcpip/buffer/prependable.go b/pkg/tcpip/buffer/prependable.go index 4287464f3..48a2a2713 100644 --- a/pkg/tcpip/buffer/prependable.go +++ b/pkg/tcpip/buffer/prependable.go @@ -41,6 +41,11 @@ func NewPrependableFromView(v View) Prependable { return Prependable{buf: v, usedIdx: 0} } +// NewEmptyPrependableFromView creates a new prependable buffer from a View. +func NewEmptyPrependableFromView(v View) Prependable { + return Prependable{buf: v, usedIdx: len(v)} +} + // View returns a View of the backing buffer that contains all prepended // data so far. func (p Prependable) View() View { diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD index de3c7e2c3..7b3189303 100644 --- a/pkg/tcpip/header/BUILD +++ b/pkg/tcpip/header/BUILD @@ -36,10 +36,14 @@ go_test( name = "header_x_test", size = "small", srcs = [ + "checksum_test.go", "ipversion_test.go", "tcp_test.go", ], - deps = [":header"], + deps = [ + ":header", + "//pkg/tcpip/buffer", + ], ) go_test( diff --git a/pkg/tcpip/header/checksum.go b/pkg/tcpip/header/checksum.go index 39a4d69be..9749c7f4d 100644 --- a/pkg/tcpip/header/checksum.go +++ b/pkg/tcpip/header/checksum.go @@ -23,11 +23,17 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/buffer" ) -func calculateChecksum(buf []byte, initial uint32) uint16 { +func calculateChecksum(buf []byte, odd bool, initial uint32) (uint16, bool) { v := initial + if odd { + v += uint32(buf[0]) + buf = buf[1:] + } + l := len(buf) - if l&1 != 0 { + odd = l&1 != 0 + if odd { l-- v += uint32(buf[l]) << 8 } @@ -36,7 +42,7 @@ func calculateChecksum(buf []byte, initial uint32) uint16 { v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) } - return ChecksumCombine(uint16(v), uint16(v>>16)) + return ChecksumCombine(uint16(v), uint16(v>>16)), odd } // Checksum calculates the checksum (as defined in RFC 1071) of the bytes in the @@ -44,7 +50,8 @@ func calculateChecksum(buf []byte, initial uint32) uint16 { // // The initial checksum must have been computed on an even number of bytes. func Checksum(buf []byte, initial uint16) uint16 { - return calculateChecksum(buf, uint32(initial)) + s, _ := calculateChecksum(buf, false, uint32(initial)) + return s } // ChecksumVV calculates the checksum (as defined in RFC 1071) of the bytes in @@ -52,19 +59,40 @@ func Checksum(buf []byte, initial uint16) uint16 { // // The initial checksum must have been computed on an even number of bytes. func ChecksumVV(vv buffer.VectorisedView, initial uint16) uint16 { - var odd bool + return ChecksumVVWithOffset(vv, initial, 0, vv.Size()) +} + +// ChecksumVVWithOffset calculates the checksum (as defined in RFC 1071) of the +// bytes in the given VectorizedView. +// +// The initial checksum must have been computed on an even number of bytes. +func ChecksumVVWithOffset(vv buffer.VectorisedView, initial uint16, off int, size int) uint16 { + odd := false sum := initial for _, v := range vv.Views() { if len(v) == 0 { continue } - s := uint32(sum) - if odd { - s += uint32(v[0]) - v = v[1:] + + if off >= len(v) { + off -= len(v) + continue + } + v = v[off:] + + l := len(v) + if l > size { + l = size + } + v = v[:l] + + sum, odd = calculateChecksum(v, odd, uint32(sum)) + + size -= len(v) + if size == 0 { + break } - odd = len(v)&1 != 0 - sum = calculateChecksum(v, s) + off = 0 } return sum } diff --git a/pkg/tcpip/header/checksum_test.go b/pkg/tcpip/header/checksum_test.go new file mode 100644 index 000000000..86b466c1c --- /dev/null +++ b/pkg/tcpip/header/checksum_test.go @@ -0,0 +1,109 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package header provides the implementation of the encoding and decoding of +// network protocol headers. +package header_test + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" +) + +func TestChecksumVVWithOffset(t *testing.T) { + testCases := []struct { + name string + vv buffer.VectorisedView + off, size int + initial uint16 + want uint16 + }{ + { + name: "empty", + vv: buffer.NewVectorisedView(0, []buffer.View{ + buffer.NewViewFromBytes([]byte{1, 9, 0, 5, 4}), + }), + off: 0, + size: 0, + want: 0, + }, + { + name: "OneView", + vv: buffer.NewVectorisedView(0, []buffer.View{ + buffer.NewViewFromBytes([]byte{1, 9, 0, 5, 4}), + }), + off: 0, + size: 5, + want: 1294, + }, + { + name: "TwoViews", + vv: buffer.NewVectorisedView(0, []buffer.View{ + buffer.NewViewFromBytes([]byte{1, 9, 0, 5, 4}), + buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123}), + }), + off: 0, + size: 11, + want: 33819, + }, + { + name: "TwoViewsWithOffset", + vv: buffer.NewVectorisedView(0, []buffer.View{ + buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}), + buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123}), + }), + off: 1, + size: 11, + want: 33819, + }, + { + name: "ThreeViewsWithOffset", + vv: buffer.NewVectorisedView(0, []buffer.View{ + buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}), + buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}), + buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123}), + }), + off: 7, + size: 11, + want: 33819, + }, + { + name: "ThreeViewsWithInitial", + vv: buffer.NewVectorisedView(0, []buffer.View{ + buffer.NewViewFromBytes([]byte{77, 11, 33, 0, 55, 44}), + buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}), + buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123, 99}), + }), + initial: 77, + off: 7, + size: 11, + want: 33896, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if got, want := header.ChecksumVVWithOffset(tc.vv, tc.initial, tc.off, tc.size), tc.want; got != want { + t.Errorf("header.ChecksumVVWithOffset(%v) = %v, want: %v", tc, got, tc.want) + } + v := tc.vv.ToView() + v.TrimFront(tc.off) + v.CapLength(tc.size) + if got, want := header.Checksum(v, tc.initial), tc.want; got != want { + t.Errorf("header.Checksum(%v) = %v, want: %v", tc, got, tc.want) + } + }) + } +} diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index 95c6f0828..14f197a77 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -96,7 +96,7 @@ func (e *Endpoint) MTU() uint32 { func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities { caps := stack.LinkEndpointCapabilities(0) if e.GSO { - caps |= stack.CapabilityGSO + caps |= stack.CapabilityHardwareGSO } return caps } @@ -134,6 +134,33 @@ func (e *Endpoint) WritePacket(_ *stack.Route, gso *stack.GSO, hdr buffer.Prepen return nil } +// WritePackets stores outbound packets into the channel. +func (e *Endpoint) WritePackets(_ *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + payloadView := payload.ToView() + n := 0 +packetLoop: + for i := range hdrs { + hdr := &hdrs[i].Hdr + off := hdrs[i].Off + size := hdrs[i].Size + p := PacketInfo{ + Header: hdr.View(), + Proto: protocol, + Payload: buffer.NewViewFromBytes(payloadView[off : off+size]), + GSO: gso, + } + + select { + case e.C <- p: + n++ + default: + break packetLoop + } + } + + return n, nil +} + // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *Endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { p := PacketInfo{ diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 255220d1a..ae4858529 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -165,6 +165,9 @@ type Options struct { // disabled. GSOMaxSize uint32 + // SoftwareGSOEnabled indicates whether software GSO is enabled or not. + SoftwareGSOEnabled bool + // PacketDispatchMode specifies the type of inbound dispatcher to be // used for this endpoint. PacketDispatchMode PacketDispatchMode @@ -242,7 +245,11 @@ func New(opts *Options) (stack.LinkEndpoint, error) { } if isSocket { if opts.GSOMaxSize != 0 { - e.caps |= stack.CapabilityGSO + if opts.SoftwareGSOEnabled { + e.caps |= stack.CapabilitySoftwareGSO + } else { + e.caps |= stack.CapabilityHardwareGSO + } e.gsoMaxSize = opts.GSOMaxSize } } @@ -397,7 +404,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen eth.Encode(ethHdr) } - if e.Capabilities()&stack.CapabilityGSO != 0 { + if e.Capabilities()&stack.CapabilityHardwareGSO != 0 { vnetHdr := virtioNetHdr{} vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr) if gso != nil { @@ -430,6 +437,123 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen return rawfile.NonBlockingWrite3(e.fds[0], hdr.View(), payload.ToView(), nil) } +// WritePackets writes outbound packets to the file descriptor. If it is not +// currently writable, the packet is dropped. +func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + var ethHdrBuf []byte + // hdr + data + iovLen := 2 + if e.hdrSize > 0 { + // Add ethernet header if needed. + ethHdrBuf = make([]byte, header.EthernetMinimumSize) + eth := header.Ethernet(ethHdrBuf) + ethHdr := &header.EthernetFields{ + DstAddr: r.RemoteLinkAddress, + Type: protocol, + } + + // Preserve the src address if it's set in the route. + if r.LocalLinkAddress != "" { + ethHdr.SrcAddr = r.LocalLinkAddress + } else { + ethHdr.SrcAddr = e.addr + } + eth.Encode(ethHdr) + iovLen++ + } + + n := len(hdrs) + + views := payload.Views() + /* + * Each bondary in views can add one more iovec. + * + * payload | | | | + * ----------------------------- + * packets | | | | | | | + * ----------------------------- + * iovecs | | | | | | | | | + */ + iovec := make([]syscall.Iovec, n*iovLen+len(views)-1) + mmsgHdrs := make([]rawfile.MMsgHdr, n) + + iovecIdx := 0 + viewIdx := 0 + viewOff := 0 + off := 0 + nextOff := 0 + for i := range hdrs { + prevIovecIdx := iovecIdx + mmsgHdr := &mmsgHdrs[i] + mmsgHdr.Msg.Iov = &iovec[iovecIdx] + packetSize := hdrs[i].Size + hdr := &hdrs[i].Hdr + + off = hdrs[i].Off + if off != nextOff { + // We stop in a different point last time. + size := packetSize + viewIdx = 0 + viewOff = 0 + for size > 0 { + if size >= len(views[viewIdx]) { + viewIdx++ + viewOff = 0 + size -= len(views[viewIdx]) + } else { + viewOff = size + size = 0 + } + } + } + nextOff = off + packetSize + + if ethHdrBuf != nil { + v := &iovec[iovecIdx] + v.Base = ðHdrBuf[0] + v.Len = uint64(len(ethHdrBuf)) + iovecIdx++ + } + + v := &iovec[iovecIdx] + hdrView := hdr.View() + v.Base = &hdrView[0] + v.Len = uint64(len(hdrView)) + iovecIdx++ + + for packetSize > 0 { + vec := &iovec[iovecIdx] + iovecIdx++ + + v := views[viewIdx] + vec.Base = &v[viewOff] + s := len(v) - viewOff + if s <= packetSize { + viewIdx++ + viewOff = 0 + } else { + s = packetSize + viewOff += s + } + vec.Len = uint64(s) + packetSize -= s + } + + mmsgHdr.Msg.Iovlen = uint64(iovecIdx - prevIovecIdx) + } + + packets := 0 + for packets < n { + sent, err := rawfile.NonBlockingSendMMsg(e.fds[0], mmsgHdrs) + if err != nil { + return packets, err + } + packets += sent + mmsgHdrs = mmsgHdrs[sent:] + } + return packets, nil +} + // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { return rawfile.NonBlockingWrite(e.fds[0], packet.ToView()) diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go index 9fb760891..12168a1dc 100644 --- a/pkg/tcpip/link/fdbased/packet_dispatchers.go +++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go @@ -53,7 +53,7 @@ func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) { d := &readVDispatcher{fd: fd, e: e} d.views = make([]buffer.View, len(BufConfig)) iovLen := len(BufConfig) - if d.e.Capabilities()&stack.CapabilityGSO != 0 { + if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { iovLen++ } d.iovecs = make([]syscall.Iovec, iovLen) @@ -63,7 +63,7 @@ func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) { func (d *readVDispatcher) allocateViews(bufConfig []int) { var vnetHdr [virtioNetHdrSize]byte vnetHdrOff := 0 - if d.e.Capabilities()&stack.CapabilityGSO != 0 { + if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { // The kernel adds virtioNetHdr before each packet, but // we don't use it, so so we allocate a buffer for it, // add it in iovecs but don't add it in a view. @@ -106,7 +106,7 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { if err != nil { return false, err } - if d.e.Capabilities()&stack.CapabilityGSO != 0 { + if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { // Skip virtioNetHdr which is added before each packet, it // isn't used and it isn't in a view. n -= virtioNetHdrSize @@ -195,7 +195,7 @@ func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) { } d.iovecs = make([][]syscall.Iovec, MaxMsgsPerRecv) iovLen := len(BufConfig) - if d.e.Capabilities()&stack.CapabilityGSO != 0 { + if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { // virtioNetHdr is prepended before each packet. iovLen++ } @@ -226,7 +226,7 @@ func (d *recvMMsgDispatcher) allocateViews(bufConfig []int) { for k := 0; k < len(d.views); k++ { var vnetHdr [virtioNetHdrSize]byte vnetHdrOff := 0 - if d.e.Capabilities()&stack.CapabilityGSO != 0 { + if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { // The kernel adds virtioNetHdr before each packet, but // we don't use it, so so we allocate a buffer for it, // add it in iovecs but don't add it in a view. @@ -262,7 +262,7 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { // Process each of received packets. for k := 0; k < nMsgs; k++ { n := int(d.msgHdrs[k].Len) - if d.e.Capabilities()&stack.CapabilityGSO != 0 { + if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { n -= virtioNetHdrSize } if n <= d.e.hdrSize { diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index 5f5bacf27..a3b48fa73 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -90,6 +90,11 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prependa return nil } +// WritePackets implements stack.LinkEndpoint.WritePackets. +func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + panic("not implemented") +} + // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { // Reject the packet if it's shorter than an ethernet header. diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go index 41524e271..682b60291 100644 --- a/pkg/tcpip/link/muxed/injectable.go +++ b/pkg/tcpip/link/muxed/injectable.go @@ -84,12 +84,23 @@ func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, vv, nil /* linkHeader */) } +// WritePackets writes outbound packets to the appropriate +// LinkInjectableEndpoint based on the RemoteAddress. HandleLocal only works if +// r.RemoteAddress has a route registered in this endpoint. +func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + endpoint, ok := m.routes[r.RemoteAddress] + if !ok { + return 0, tcpip.ErrNoRoute + } + return endpoint.WritePackets(r, gso, hdrs, payload, protocol) +} + // WritePacket writes outbound packets to the appropriate LinkInjectableEndpoint // based on the RemoteAddress. HandleLocal only works if r.RemoteAddress has a // route registered in this endpoint. -func (m *InjectableEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { +func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { if endpoint, ok := m.routes[r.RemoteAddress]; ok { - return endpoint.WritePacket(r, nil /* gso */, hdr, payload, protocol) + return endpoint.WritePacket(r, gso, hdr, payload, protocol) } return tcpip.ErrNoRoute } diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD index 2e8bc772a..05c7b8024 100644 --- a/pkg/tcpip/link/rawfile/BUILD +++ b/pkg/tcpip/link/rawfile/BUILD @@ -16,5 +16,8 @@ go_library( visibility = [ "//visibility:public", ], - deps = ["//pkg/tcpip"], + deps = [ + "//pkg/tcpip", + "@org_golang_x_sys//unix:go_default_library", + ], ) diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index 7e286a3a6..44e25d475 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -22,6 +22,7 @@ import ( "syscall" "unsafe" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/tcpip" ) @@ -101,6 +102,16 @@ func NonBlockingWrite3(fd int, b1, b2, b3 []byte) *tcpip.Error { return nil } +// NonBlockingSendMMsg sends multiple messages on a socket. +func NonBlockingSendMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) { + n, _, e := syscall.RawSyscall6(unix.SYS_SENDMMSG, uintptr(fd), uintptr(unsafe.Pointer(&msgHdrs[0])), uintptr(len(msgHdrs)), syscall.MSG_DONTWAIT, 0, 0) + if e != 0 { + return 0, TranslateErrno(e) + } + + return int(n), nil +} + // PollEvent represents the pollfd structure passed to a poll() system call. type PollEvent struct { FD int32 diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index 4393403c5..279e2b457 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -212,6 +212,11 @@ func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.Prependa return nil } +// WritePackets implements stack.LinkEndpoint.WritePackets. +func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + panic("not implemented") +} + // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { v := packet.ToView() diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index de9ff98cb..39757ea2a 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -193,10 +193,7 @@ func (e *endpoint) GSOMaxSize() uint32 { return 0 } -// WritePacket implements the stack.LinkEndpoint interface. It is called by -// higher-level protocols to write packets; it just logs the packet and forwards -// the request to the lower endpoint. -func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { +func (e *endpoint) dumpPacket(gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) { if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil { logPacket("send", protocol, hdr.View(), gso) } @@ -223,9 +220,27 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen panic(err) } } +} + +// WritePacket implements the stack.LinkEndpoint interface. It is called by +// higher-level protocols to write packets; it just logs the packet and +// forwards the request to the lower endpoint. +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { + e.dumpPacket(gso, hdr, payload, protocol) return e.lower.WritePacket(r, gso, hdr, payload, protocol) } +// WritePackets implements the stack.LinkEndpoint interface. It is called by +// higher-level protocols to write packets; it just logs the packet and +// forwards the request to the lower endpoint. +func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + view := payload.ToView() + for _, d := range hdrs { + e.dumpPacket(gso, d.Hdr, buffer.NewVectorisedView(d.Size, []buffer.View{view[d.Off:][:d.Size]}), protocol) + } + return e.lower.WritePackets(r, gso, hdrs, payload, protocol) +} + // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil { diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go index e9b50b802..a04fc1062 100644 --- a/pkg/tcpip/link/waitable/waitable.go +++ b/pkg/tcpip/link/waitable/waitable.go @@ -109,6 +109,19 @@ func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen return err } +// WritePackets implements stack.LinkEndpoint.WritePackets. It is called by +// higher-level protocols to write packets. It only forwards packets to the +// lower endpoint if Wait or WaitWrite haven't been called. +func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + if !e.writeGate.Enter() { + return len(hdrs), nil + } + + n, err := e.lower.WritePackets(r, gso, hdrs, payload, protocol) + e.writeGate.Leave() + return n, err +} + // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *Endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { if !e.writeGate.Enter() { diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go index f03bbceec..5f0f8fa2d 100644 --- a/pkg/tcpip/link/waitable/waitable_test.go +++ b/pkg/tcpip/link/waitable/waitable_test.go @@ -70,6 +70,12 @@ func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, hdr buffer.P return nil } +// WritePackets implements stack.LinkEndpoint.WritePackets. +func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + e.writeCount += len(hdrs) + return len(hdrs), nil +} + func (e *countedEndpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { e.writeCount++ return nil diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 6b1e854dc..46178459e 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -83,6 +83,11 @@ func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, buffer.Prependable, buf return tcpip.ErrNotSupported } +// WritePackets implements stack.NetworkEndpoint.WritePackets. +func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []stack.PacketDescriptor, buffer.VectorisedView, stack.NetworkHeaderParams, stack.PacketLooping) (int, *tcpip.Error) { + return 0, tcpip.ErrNotSupported +} + func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error { return tcpip.ErrNotSupported } diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index bd09b9a1c..8d74497ba 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -171,6 +171,11 @@ func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, hdr buffer.Prepen return nil } +// WritePackets implements stack.LinkEndpoint.WritePackets. +func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, hdr []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + panic("not implemented") +} + func (t *testObject) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error { return tcpip.ErrNotSupported } diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index 5cd895ff0..1339f8474 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -198,10 +198,9 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, hdr buff return nil } -// WritePacket writes a packet to the given destination address and protocol. -func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error { +func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) { ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) - length := uint16(hdr.UsedLength() + payload.Size()) + length := uint16(hdr.UsedLength() + payloadSize) id := uint32(0) if length > header.IPv4MaximumHeaderSize+8 { // Packets of 68 bytes or less are required by RFC 791 to not be @@ -219,6 +218,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen DstAddr: r.RemoteAddress, }) ip.SetChecksum(^ip.CalculateChecksum()) +} + +// WritePacket writes a packet to the given destination address and protocol. +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error { + e.addIPHeader(r, &hdr, payload.Size(), params) if loop&stack.PacketLoop != 0 { views := make([]buffer.View, 1, 1+len(payload.Views())) @@ -242,6 +246,23 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen return nil } +// WritePackets implements stack.NetworkEndpoint.WritePackets. +func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) { + if loop&stack.PacketLoop != 0 { + panic("multiple packets in local loop") + } + if loop&stack.PacketOut == 0 { + return len(hdrs), nil + } + + for i := range hdrs { + e.addIPHeader(r, &hdrs[i].Hdr, hdrs[i].Size, params) + } + n, err := e.linkEP.WritePackets(r, gso, hdrs, payload, ProtocolNumber) + r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) + return n, err +} + // WriteHeaderIncludedPacket writes a packet already containing a network // header through the given route. func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error { diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index cd1e34085..5898f8f9e 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -97,9 +97,8 @@ func (e *endpoint) GSOMaxSize() uint32 { return 0 } -// WritePacket writes a packet to the given destination address and protocol. -func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error { - length := uint16(hdr.UsedLength() + payload.Size()) +func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) { + length := uint16(hdr.UsedLength() + payloadSize) ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ PayloadLength: length, @@ -109,6 +108,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen SrcAddr: r.LocalAddress, DstAddr: r.RemoteAddress, }) +} + +// WritePacket writes a packet to the given destination address and protocol. +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) *tcpip.Error { + e.addIPHeader(r, &hdr, payload.Size(), params) if loop&stack.PacketLoop != 0 { views := make([]buffer.View, 1, 1+len(payload.Views())) @@ -127,6 +131,26 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen return e.linkEP.WritePacket(r, gso, hdr, payload, ProtocolNumber) } +// WritePackets implements stack.LinkEndpoint.WritePackets. +func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) { + if loop&stack.PacketLoop != 0 { + panic("not implemented") + } + if loop&stack.PacketOut == 0 { + return len(hdrs), nil + } + + for i := range hdrs { + hdr := &hdrs[i].Hdr + size := hdrs[i].Size + e.addIPHeader(r, hdr, size, params) + } + + n, err := e.linkEP.WritePackets(r, gso, hdrs, payload, ProtocolNumber) + r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) + return n, err +} + // WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet // supported by IPv6. func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error { diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index f655d011f..0869fb084 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -201,6 +201,10 @@ type NetworkEndpoint interface { // protocol. WritePacket(r *Route, gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, params NetworkHeaderParams, loop PacketLooping) *tcpip.Error + // WritePackets writes packets to the given destination address and + // protocol. + WritePackets(r *Route, gso *GSO, hdrs []PacketDescriptor, payload buffer.VectorisedView, params NetworkHeaderParams, loop PacketLooping) (int, *tcpip.Error) + // WriteHeaderIncludedPacket writes a packet that includes a network // header to the given destination address. WriteHeaderIncludedPacket(r *Route, payload buffer.VectorisedView, loop PacketLooping) *tcpip.Error @@ -283,7 +287,11 @@ const ( CapabilitySaveRestore CapabilityDisconnectOk CapabilityLoopback - CapabilityGSO + CapabilityHardwareGSO + + // CapabilitySoftwareGSO indicates the link endpoint supports of sending + // multiple packets using a single call (LinkEndpoint.WritePackets). + CapabilitySoftwareGSO ) // LinkEndpoint is the interface implemented by data link layer protocols (e.g., @@ -318,6 +326,14 @@ type LinkEndpoint interface { // r.LocalLinkAddress if it is provided. WritePacket(r *Route, gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error + // WritePackets writes packets with the given protocol through the + // given route. + // + // Right now, WritePackets is used only when the software segmentation + // offload is enabled. If it will be used for something else, it may + // require to change syscall filters. + WritePackets(r *Route, gso *GSO, hdrs []PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) + // WriteRawPacket writes a packet directly to the link. The packet // should already have an ethernet header. WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error @@ -421,8 +437,14 @@ type GSOType int // Types of gso segments. const ( GSONone GSOType = iota + + // Hardware GSO types: GSOTCPv4 GSOTCPv6 + + // GSOSW is used for software GSO segments which have to be sent by + // endpoint.WritePackets. + GSOSW ) // GSO contains generic segmentation offload properties. @@ -450,3 +472,7 @@ type GSOEndpoint interface { // GSOMaxSize returns the maximum GSO packet size. GSOMaxSize() uint32 } + +// SoftwareGSOMaxSize is a maximum allowed size of a software GSO segment. +// This isn't a hard limit, because it is never set into packet headers. +const SoftwareGSOMaxSize = (1 << 16) diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index e72373964..1a0a51b57 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -47,8 +47,8 @@ type Route struct { // starts. ref *referencedNetworkEndpoint - // loop controls where WritePacket should send packets. - loop PacketLooping + // Loop controls where WritePacket should send packets. + Loop PacketLooping } // makeRoute initializes a new route. It takes ownership of the provided @@ -69,7 +69,7 @@ func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip LocalLinkAddress: localLinkAddr, RemoteAddress: remoteAddr, ref: ref, - loop: loop, + Loop: loop, } } @@ -159,7 +159,7 @@ func (r *Route) WritePacket(gso *GSO, hdr buffer.Prependable, payload buffer.Vec return tcpip.ErrInvalidEndpointState } - err := r.ref.ep.WritePacket(r, gso, hdr, payload, params, r.loop) + err := r.ref.ep.WritePacket(r, gso, hdr, payload, params, r.Loop) if err != nil { r.Stats().IP.OutgoingPacketErrors.Increment() } else { @@ -169,6 +169,44 @@ func (r *Route) WritePacket(gso *GSO, hdr buffer.Prependable, payload buffer.Vec return err } +// PacketDescriptor is a packet descriptor which contains a packet header and +// offset and size of packet data in a payload view. +type PacketDescriptor struct { + Hdr buffer.Prependable + Off int + Size int +} + +// NewPacketDescriptors allocates a set of packet descriptors. +func NewPacketDescriptors(n int, hdrSize int) []PacketDescriptor { + buf := make([]byte, n*hdrSize) + hdrs := make([]PacketDescriptor, n) + for i := range hdrs { + hdrs[i].Hdr = buffer.NewEmptyPrependableFromView(buf[i*hdrSize:][:hdrSize]) + } + return hdrs +} + +// WritePackets writes the set of packets through the given route. +func (r *Route) WritePackets(gso *GSO, hdrs []PacketDescriptor, payload buffer.VectorisedView, params NetworkHeaderParams) (int, *tcpip.Error) { + if !r.ref.isValidForOutgoing() { + return 0, tcpip.ErrInvalidEndpointState + } + + n, err := r.ref.ep.WritePackets(r, gso, hdrs, payload, params, r.Loop) + if err != nil { + r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(len(hdrs) - n)) + } + r.ref.nic.stats.Tx.Packets.IncrementBy(uint64(n)) + payloadSize := 0 + for i := 0; i < n; i++ { + r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(hdrs[i].Hdr.UsedLength())) + payloadSize += hdrs[i].Size + } + r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(payloadSize)) + return n, err +} + // WriteHeaderIncludedPacket writes a packet already containing a network // header through the given route. func (r *Route) WriteHeaderIncludedPacket(payload buffer.VectorisedView) *tcpip.Error { @@ -176,7 +214,7 @@ func (r *Route) WriteHeaderIncludedPacket(payload buffer.VectorisedView) *tcpip. return tcpip.ErrInvalidEndpointState } - if err := r.ref.ep.WriteHeaderIncludedPacket(r, payload, r.loop); err != nil { + if err := r.ref.ep.WriteHeaderIncludedPacket(r, payload, r.Loop); err != nil { r.Stats().IP.OutgoingPacketErrors.Increment() return err } diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 0a184fabe..72eee9334 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -146,6 +146,11 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr bu return f.ep.WritePacket(r, gso, hdr, payload, fakeNetNumber) } +// WritePackets implements stack.LinkEndpoint.WritePackets. +func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) { + panic("not implemented") +} + func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, payload buffer.VectorisedView, loop stack.PacketLooping) *tcpip.Error { return tcpip.ErrNotSupported } diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index aed70e06f..f1dbc6f91 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -44,6 +44,7 @@ go_library( imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ + "//pkg/log", "//pkg/rand", "//pkg/sleep", "//pkg/tcpip", diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 5ea036bea..082135374 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -607,17 +607,11 @@ func (e *endpoint) sendTCP(r *stack.Route, id stack.TransportEndpointID, data bu return nil } -// sendTCP sends a TCP segment with the provided options via the provided -// network endpoint and under the provided identity. -func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error { +func buildTCPHdr(r *stack.Route, id stack.TransportEndpointID, d *stack.PacketDescriptor, data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) { optLen := len(opts) - // Allocate a buffer for the TCP header. - hdr := buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen) - - if rcvWnd > 0xffff { - rcvWnd = 0xffff - } - + hdr := &d.Hdr + packetSize := d.Size + off := d.Off // Initialize the header. tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen)) tcp.Encode(&header.TCPFields{ @@ -631,7 +625,7 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise }) copy(tcp[header.TCPMinimumSize:], opts) - length := uint16(hdr.UsedLength() + data.Size()) + length := uint16(hdr.UsedLength() + packetSize) xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) // Only calculate the checksum if offloading isn't supported. if gso != nil && gso.NeedsCsum { @@ -641,14 +635,71 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise // header and data and get the right sum of the TCP packet. tcp.SetChecksum(xsum) } else if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 { - xsum = header.ChecksumVV(data, xsum) + xsum = header.ChecksumVVWithOffset(data, xsum, off, packetSize) tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) } +} + +func sendTCPBatch(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error { + optLen := len(opts) + if rcvWnd > 0xffff { + rcvWnd = 0xffff + } + + mss := int(gso.MSS) + n := (data.Size() + mss - 1) / mss + + hdrs := stack.NewPacketDescriptors(n, header.TCPMinimumSize+int(r.MaxHeaderLength())+optLen) + + size := data.Size() + off := 0 + for i := 0; i < n; i++ { + packetSize := mss + if packetSize > size { + packetSize = size + } + size -= packetSize + hdrs[i].Off = off + hdrs[i].Size = packetSize + buildTCPHdr(r, id, &hdrs[i], data, flags, seq, ack, rcvWnd, opts, gso) + off += packetSize + seq = seq.Add(seqnum.Size(packetSize)) + } + if ttl == 0 { + ttl = r.DefaultTTL() + } + sent, err := r.WritePackets(gso, hdrs, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}) + if err != nil { + r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent)) + } + r.Stats().TCP.SegmentsSent.IncrementBy(uint64(sent)) + return err +} + +// sendTCP sends a TCP segment with the provided options via the provided +// network endpoint and under the provided identity. +func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl, tos uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error { + optLen := len(opts) + if rcvWnd > 0xffff { + rcvWnd = 0xffff + } + + if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() { + return sendTCPBatch(r, id, data, ttl, tos, flags, seq, ack, rcvWnd, opts, gso) + } + + d := &stack.PacketDescriptor{ + Hdr: buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen), + Off: 0, + Size: data.Size(), + } + buildTCPHdr(r, id, d, data, flags, seq, ack, rcvWnd, opts, gso) + if ttl == 0 { ttl = r.DefaultTTL() } - if err := r.WritePacket(gso, hdr, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}); err != nil { + if err := r.WritePacket(gso, d.Hdr, data, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}); err != nil { r.Stats().TCP.SegmentSendErrors.Increment() return err } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index a1b784b49..8b9cb4c33 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -2327,11 +2327,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState { return s } -func (e *endpoint) initGSO() { - if e.route.Capabilities()&stack.CapabilityGSO == 0 { - return - } - +func (e *endpoint) initHardwareGSO() { gso := &stack.GSO{} switch e.route.NetProto { case header.IPv4ProtocolNumber: @@ -2349,6 +2345,18 @@ func (e *endpoint) initGSO() { e.gso = gso } +func (e *endpoint) initGSO() { + if e.route.Capabilities()&stack.CapabilityHardwareGSO != 0 { + e.initHardwareGSO() + } else if e.route.Capabilities()&stack.CapabilitySoftwareGSO != 0 { + e.gso = &stack.GSO{ + MaxSize: e.route.GSOMaxSize(), + Type: stack.GSOSW, + NeedsCsum: false, + } + } +} + // State implements tcpip.Endpoint.State. It exports the endpoint's protocol // state for diagnostics. func (e *endpoint) State() uint32 { diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 01a29e8d5..72a33534f 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -178,8 +178,11 @@ type Config struct { // capabilities. EnableRaw bool - // GSO indicates that generic segmentation offload is enabled. - GSO bool + // HardwareGSO indicates that hardware segmentation offload is enabled. + HardwareGSO bool + + // SoftwareGSO indicates that software segmentation offload is enabled. + SoftwareGSO bool // LogPackets indicates that all network packets should be logged. LogPackets bool @@ -275,6 +278,8 @@ func (c *Config) ToFlags() []string { "--rootless=" + strconv.FormatBool(c.Rootless), "--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr), "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode), + "--gso=" + strconv.FormatBool(c.HardwareGSO), + "--software-gso=" + strconv.FormatBool(c.SoftwareGSO), "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead), } // Only include these if set since it is never to be used by users. diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index efbf1fd4a..5ad108261 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -243,6 +243,15 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(0), }, }, + unix.SYS_SENDMMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT), + seccomp.AllowValue(0), + }, + }, syscall.SYS_RESTART_SYSCALL: {}, syscall.SYS_RT_SIGACTION: {}, syscall.SYS_RT_SIGPROCMASK: {}, diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 32cba5ac1..f98c5fd36 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -50,12 +50,13 @@ type DefaultRoute struct { // FDBasedLink configures an fd-based link. type FDBasedLink struct { - Name string - MTU int - Addresses []net.IP - Routes []Route - GSOMaxSize uint32 - LinkAddress net.HardwareAddr + Name string + MTU int + Addresses []net.IP + Routes []Route + GSOMaxSize uint32 + SoftwareGSOEnabled bool + LinkAddress net.HardwareAddr // NumChannels controls how many underlying FD's are to be used to // create this endpoint. @@ -163,6 +164,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct Address: mac, PacketDispatchMode: fdbased.RecvMMsg, GSOMaxSize: link.GSOMaxSize, + SoftwareGSOEnabled: link.SoftwareGSOEnabled, RXChecksumOffload: true, }) if err != nil { diff --git a/runsc/main.go b/runsc/main.go index 80b2d300c..ae906c661 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -66,7 +66,8 @@ var ( // Flags that control sandbox runtime behavior. platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.") network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - gso = flag.Bool("gso", true, "enable generic segmenation offload.") + hardwareGSO = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.") + softwareGSO = flag.Bool("software-gso", true, "enable software segmentation offload when hardware ofload can't be enabled.") fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") @@ -200,7 +201,8 @@ func main() { FSGoferHostUDS: *fsGoferHostUDS, Overlay: *overlay, Network: netType, - GSO: *gso, + HardwareGSO: *hardwareGSO, + SoftwareGSO: *softwareGSO, LogPackets: *logPackets, Platform: platformType, Strace: *strace, diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 7fdceaab6..27459e6d1 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -19,6 +19,7 @@ go_library( "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/platform", + "//pkg/tcpip/stack", "//pkg/urpc", "//runsc/boot", "//runsc/boot/platforms", diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 5634f0707..d42de0176 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -28,6 +28,7 @@ import ( "github.com/vishvananda/netlink" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/specutils" @@ -61,7 +62,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi // Build the path to the net namespace of the sandbox process. // This is what we will copy. nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") - if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil { + if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels); err != nil { return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) } case boot.NetworkHost: @@ -136,7 +137,7 @@ func isRootNS() (bool, error) { // createInterfacesAndRoutesFromNS scrapes the interface and routes from the // net namespace with the given path, creates them in the sandbox, and removes // them from the host. -func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error { +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int) error { // Join the network namespace that we will be copying. restore, err := joinNetNS(nsPath) if err != nil { @@ -232,7 +233,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO // Create the socket for the device. for i := 0; i < link.NumChannels; i++ { log.Debugf("Creating Channel %d", i) - socketEntry, err := createSocket(iface, ifaceLink, enableGSO) + socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO) if err != nil { return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err) } @@ -246,6 +247,11 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO } args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile) } + if link.GSOMaxSize == 0 && softwareGSO { + // Hardware GSO is disabled. Let's enable software GSO. + link.GSOMaxSize = stack.SoftwareGSOMaxSize + link.SoftwareGSOEnabled = true + } // Collect the addresses for the interface, enable forwarding, // and remove them from the host. diff --git a/scripts/swgso_tests.sh b/scripts/swgso_tests.sh new file mode 100755 index 000000000..0de2df1d2 --- /dev/null +++ b/scripts/swgso_tests.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Copyright 2019 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +source $(dirname $0)/common.sh + +# Install the runtime and perform basic tests. +install_runsc_for_test swgso --software-gso=true --gso=false +test_runsc //test/image:image_test //test/e2e:integration_test -- cgit v1.2.3