From 8720bd643ef37a693984baba83c37458f6691df1 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 22 Oct 2019 11:54:14 -0700 Subject: netstack/tcp: software segmentation offload Right now, we send each tcp packet separately, we call one system call per-packet. This patch allows to generate multiple tcp packets and send them by sendmmsg. The arguable part of this CL is a way how to handle multiple headers. This CL adds the next field to the Prepandable buffer. Nginx test results: Server Software: nginx/1.15.9 Server Hostname: 10.138.0.2 Server Port: 8080 Document Path: /10m.txt Document Length: 10485760 bytes w/o gso: Concurrency Level: 5 Time taken for tests: 5.491 seconds Complete requests: 100 Failed requests: 0 Total transferred: 1048600200 bytes HTML transferred: 1048576000 bytes Requests per second: 18.21 [#/sec] (mean) Time per request: 274.525 [ms] (mean) Time per request: 54.905 [ms] (mean, across all concurrent requests) Transfer rate: 186508.03 [Kbytes/sec] received sw-gso: Concurrency Level: 5 Time taken for tests: 3.852 seconds Complete requests: 100 Failed requests: 0 Total transferred: 1048600200 bytes HTML transferred: 1048576000 bytes Requests per second: 25.96 [#/sec] (mean) Time per request: 192.576 [ms] (mean) Time per request: 38.515 [ms] (mean, across all concurrent requests) Transfer rate: 265874.92 [Kbytes/sec] received w/o gso: $ ./tcp_benchmark --client --duration 15 --ideal [SUM] 0.0-15.1 sec 2.20 GBytes 1.25 Gbits/sec software gso: $ tcp_benchmark --client --duration 15 --ideal --gso $((1<<16)) --swgso [SUM] 0.0-15.1 sec 3.99 GBytes 2.26 Gbits/sec PiperOrigin-RevId: 276112677 --- pkg/tcpip/link/fdbased/endpoint.go | 128 ++++++++++++++++++++++++++- pkg/tcpip/link/fdbased/packet_dispatchers.go | 12 +-- 2 files changed, 132 insertions(+), 8 deletions(-) (limited to 'pkg/tcpip/link/fdbased') diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 255220d1a..ae4858529 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -165,6 +165,9 @@ type Options struct { // disabled. GSOMaxSize uint32 + // SoftwareGSOEnabled indicates whether software GSO is enabled or not. + SoftwareGSOEnabled bool + // PacketDispatchMode specifies the type of inbound dispatcher to be // used for this endpoint. PacketDispatchMode PacketDispatchMode @@ -242,7 +245,11 @@ func New(opts *Options) (stack.LinkEndpoint, error) { } if isSocket { if opts.GSOMaxSize != 0 { - e.caps |= stack.CapabilityGSO + if opts.SoftwareGSOEnabled { + e.caps |= stack.CapabilitySoftwareGSO + } else { + e.caps |= stack.CapabilityHardwareGSO + } e.gsoMaxSize = opts.GSOMaxSize } } @@ -397,7 +404,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen eth.Encode(ethHdr) } - if e.Capabilities()&stack.CapabilityGSO != 0 { + if e.Capabilities()&stack.CapabilityHardwareGSO != 0 { vnetHdr := virtioNetHdr{} vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr) if gso != nil { @@ -430,6 +437,123 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen return rawfile.NonBlockingWrite3(e.fds[0], hdr.View(), payload.ToView(), nil) } +// WritePackets writes outbound packets to the file descriptor. If it is not +// currently writable, the packet is dropped. +func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, hdrs []stack.PacketDescriptor, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + var ethHdrBuf []byte + // hdr + data + iovLen := 2 + if e.hdrSize > 0 { + // Add ethernet header if needed. + ethHdrBuf = make([]byte, header.EthernetMinimumSize) + eth := header.Ethernet(ethHdrBuf) + ethHdr := &header.EthernetFields{ + DstAddr: r.RemoteLinkAddress, + Type: protocol, + } + + // Preserve the src address if it's set in the route. + if r.LocalLinkAddress != "" { + ethHdr.SrcAddr = r.LocalLinkAddress + } else { + ethHdr.SrcAddr = e.addr + } + eth.Encode(ethHdr) + iovLen++ + } + + n := len(hdrs) + + views := payload.Views() + /* + * Each bondary in views can add one more iovec. + * + * payload | | | | + * ----------------------------- + * packets | | | | | | | + * ----------------------------- + * iovecs | | | | | | | | | + */ + iovec := make([]syscall.Iovec, n*iovLen+len(views)-1) + mmsgHdrs := make([]rawfile.MMsgHdr, n) + + iovecIdx := 0 + viewIdx := 0 + viewOff := 0 + off := 0 + nextOff := 0 + for i := range hdrs { + prevIovecIdx := iovecIdx + mmsgHdr := &mmsgHdrs[i] + mmsgHdr.Msg.Iov = &iovec[iovecIdx] + packetSize := hdrs[i].Size + hdr := &hdrs[i].Hdr + + off = hdrs[i].Off + if off != nextOff { + // We stop in a different point last time. + size := packetSize + viewIdx = 0 + viewOff = 0 + for size > 0 { + if size >= len(views[viewIdx]) { + viewIdx++ + viewOff = 0 + size -= len(views[viewIdx]) + } else { + viewOff = size + size = 0 + } + } + } + nextOff = off + packetSize + + if ethHdrBuf != nil { + v := &iovec[iovecIdx] + v.Base = ðHdrBuf[0] + v.Len = uint64(len(ethHdrBuf)) + iovecIdx++ + } + + v := &iovec[iovecIdx] + hdrView := hdr.View() + v.Base = &hdrView[0] + v.Len = uint64(len(hdrView)) + iovecIdx++ + + for packetSize > 0 { + vec := &iovec[iovecIdx] + iovecIdx++ + + v := views[viewIdx] + vec.Base = &v[viewOff] + s := len(v) - viewOff + if s <= packetSize { + viewIdx++ + viewOff = 0 + } else { + s = packetSize + viewOff += s + } + vec.Len = uint64(s) + packetSize -= s + } + + mmsgHdr.Msg.Iovlen = uint64(iovecIdx - prevIovecIdx) + } + + packets := 0 + for packets < n { + sent, err := rawfile.NonBlockingSendMMsg(e.fds[0], mmsgHdrs) + if err != nil { + return packets, err + } + packets += sent + mmsgHdrs = mmsgHdrs[sent:] + } + return packets, nil +} + // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *endpoint) WriteRawPacket(packet buffer.VectorisedView) *tcpip.Error { return rawfile.NonBlockingWrite(e.fds[0], packet.ToView()) diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go index 9fb760891..12168a1dc 100644 --- a/pkg/tcpip/link/fdbased/packet_dispatchers.go +++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go @@ -53,7 +53,7 @@ func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) { d := &readVDispatcher{fd: fd, e: e} d.views = make([]buffer.View, len(BufConfig)) iovLen := len(BufConfig) - if d.e.Capabilities()&stack.CapabilityGSO != 0 { + if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { iovLen++ } d.iovecs = make([]syscall.Iovec, iovLen) @@ -63,7 +63,7 @@ func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) { func (d *readVDispatcher) allocateViews(bufConfig []int) { var vnetHdr [virtioNetHdrSize]byte vnetHdrOff := 0 - if d.e.Capabilities()&stack.CapabilityGSO != 0 { + if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { // The kernel adds virtioNetHdr before each packet, but // we don't use it, so so we allocate a buffer for it, // add it in iovecs but don't add it in a view. @@ -106,7 +106,7 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { if err != nil { return false, err } - if d.e.Capabilities()&stack.CapabilityGSO != 0 { + if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { // Skip virtioNetHdr which is added before each packet, it // isn't used and it isn't in a view. n -= virtioNetHdrSize @@ -195,7 +195,7 @@ func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) { } d.iovecs = make([][]syscall.Iovec, MaxMsgsPerRecv) iovLen := len(BufConfig) - if d.e.Capabilities()&stack.CapabilityGSO != 0 { + if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { // virtioNetHdr is prepended before each packet. iovLen++ } @@ -226,7 +226,7 @@ func (d *recvMMsgDispatcher) allocateViews(bufConfig []int) { for k := 0; k < len(d.views); k++ { var vnetHdr [virtioNetHdrSize]byte vnetHdrOff := 0 - if d.e.Capabilities()&stack.CapabilityGSO != 0 { + if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { // The kernel adds virtioNetHdr before each packet, but // we don't use it, so so we allocate a buffer for it, // add it in iovecs but don't add it in a view. @@ -262,7 +262,7 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { // Process each of received packets. for k := 0; k < nMsgs; k++ { n := int(d.msgHdrs[k].Len) - if d.e.Capabilities()&stack.CapabilityGSO != 0 { + if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { n -= virtioNetHdrSize } if n <= d.e.hdrSize { -- cgit v1.2.3