diff options
author | Andrei Vagin <avagin@google.com> | 2019-03-28 11:02:23 -0700 |
---|---|---|
committer | Shentubot <shentubot@google.com> | 2019-03-28 11:03:41 -0700 |
commit | f4105ac21a9f11f5231681239ca92ac814b5149d (patch) | |
tree | 2ecd11f283e674bce8cd29b514ae0745fdd0fa83 /pkg/tcpip/transport | |
parent | 9c188978870051f0b42ceb1a3f16320286936976 (diff) |
netstack/fdbased: add generic segmentation offload (GSO) support
The linux packet socket can handle GSO packets, so we can segment packets to
64K instead of the MTU which is usually 1500.
Here are numbers for the nginx-1m test:
runsc: 579330.01 [Kbytes/sec] received
runsc-gso: 1794121.66 [Kbytes/sec] received
runc: 2122139.06 [Kbytes/sec] received
and for tcp_benchmark:
$ tcp_benchmark --duration 15 --ideal
[ 4] 0.0-15.0 sec 86647 MBytes 48456 Mbits/sec
$ tcp_benchmark --client --duration 15 --ideal
[ 4] 0.0-15.0 sec 2173 MBytes 1214 Mbits/sec
$ tcp_benchmark --client --duration 15 --ideal --gso 65536
[ 4] 0.0-15.0 sec 19357 MBytes 10825 Mbits/sec
PiperOrigin-RevId: 240809103
Change-Id: I2637f104db28b5d4c64e1e766c610162a195775a
Diffstat (limited to 'pkg/tcpip/transport')
-rw-r--r-- | pkg/tcpip/transport/icmp/endpoint.go | 6 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/accept.go | 2 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/connect.go | 21 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/endpoint.go | 27 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/protocol.go | 2 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/snd.go | 45 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/testing/context/context.go | 4 | ||||
-rw-r--r-- | pkg/tcpip/transport/udp/endpoint.go | 2 |
8 files changed, 91 insertions, 18 deletions
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index d876005fe..182097b46 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -370,7 +370,7 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { func (e *endpoint) send4(r *stack.Route, data buffer.View) *tcpip.Error { if e.raw { hdr := buffer.NewPrependable(len(data) + int(r.MaxHeaderLength())) - return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL()) + return r.WritePacket(nil /* gso */, hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL()) } if len(data) < header.ICMPv4EchoMinimumSize { @@ -395,7 +395,7 @@ func (e *endpoint) send4(r *stack.Route, data buffer.View) *tcpip.Error { icmpv4.SetChecksum(0) icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(data, 0))) - return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL()) + return r.WritePacket(nil /* gso */, hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL()) } func send6(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { @@ -419,7 +419,7 @@ func send6(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { icmpv6.SetChecksum(0) icmpv6.SetChecksum(^header.Checksum(icmpv6, header.Checksum(data, 0))) - return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv6ProtocolNumber, r.DefaultTTL()) + return r.WritePacket(nil /* gso */, hdr, data.ToVectorisedView(), header.ICMPv6ProtocolNumber, r.DefaultTTL()) } func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) { diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 7a19737c7..a3894ed8f 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -214,6 +214,8 @@ func (l *listenContext) createConnectedEndpoint(s *segment, iss seqnum.Value, ir n.maybeEnableTimestamp(rcvdSynOpts) n.maybeEnableSACKPermitted(rcvdSynOpts) + n.initGSO() + // Register new endpoint so that packets are routed to it. if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.id, n, n.reusePort); err != nil { n.Close() diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index c4353718e..056e0b09a 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -557,14 +557,14 @@ func sendSynTCP(r *stack.Route, id stack.TransportEndpointID, flags byte, seq, a } options := makeSynOptions(opts) - err := sendTCP(r, id, buffer.VectorisedView{}, r.DefaultTTL(), flags, seq, ack, rcvWnd, options) + err := sendTCP(r, id, buffer.VectorisedView{}, r.DefaultTTL(), flags, seq, ack, rcvWnd, options, nil) putOptions(options) return err } // sendTCP sends a TCP segment with the provided options via the provided // network endpoint and under the provided identity. -func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte) *tcpip.Error { +func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error { optLen := len(opts) // Allocate a buffer for the TCP header. hdr := buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen) @@ -586,12 +586,17 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise }) copy(tcp[header.TCPMinimumSize:], opts) + length := uint16(hdr.UsedLength() + data.Size()) + xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) // Only calculate the checksum if offloading isn't supported. - if r.Capabilities()&stack.CapabilityChecksumOffload == 0 { - length := uint16(hdr.UsedLength() + data.Size()) - xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) + if gso != nil && gso.NeedsCsum { + // This is called CHECKSUM_PARTIAL in the Linux kernel. We + // calculate a checksum of the pseudo-header and save it in the + // TCP header, then the kernel calculate a checksum of the + // header and data and get the right sum of the TCP packet. + tcp.SetChecksum(xsum) + } else if r.Capabilities()&stack.CapabilityChecksumOffload == 0 { xsum = header.ChecksumVV(data, xsum) - tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) } @@ -600,7 +605,7 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise r.Stats().TCP.ResetsSent.Increment() } - return r.WritePacket(hdr, data, ProtocolNumber, ttl) + return r.WritePacket(gso, hdr, data, ProtocolNumber, ttl) } // makeOptions makes an options slice. @@ -649,7 +654,7 @@ func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqn sackBlocks = e.sack.Blocks[:e.sack.NumBlocks] } options := e.makeOptions(sackBlocks) - err := sendTCP(&e.route, e.id, data, e.route.DefaultTTL(), flags, seq, ack, rcvWnd, options) + err := sendTCP(&e.route, e.id, data, e.route.DefaultTTL(), flags, seq, ack, rcvWnd, options, e.gso) putOptions(options) return err } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 5656890f6..0427af34f 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -15,6 +15,7 @@ package tcp import ( + "fmt" "math" "sync" "sync/atomic" @@ -265,6 +266,8 @@ type endpoint struct { // The following are only used to assist the restore run to re-connect. bindAddress tcpip.Address connectingAddress tcpip.Address + + gso *stack.GSO } // StopWork halts packet processing. Only to be used in tests. @@ -1155,6 +1158,8 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er e.effectiveNetProtos = netProtos e.connectingAddress = connectingAddr + e.initGSO() + // Connect in the restore phase does not perform handshake. Restore its // connection setting here. if !handshake { @@ -1698,3 +1703,25 @@ func (e *endpoint) completeState() stack.TCPEndpointState { } return s } + +func (e *endpoint) initGSO() { + if e.route.Capabilities()&stack.CapabilityGSO == 0 { + return + } + + gso := &stack.GSO{} + switch e.netProto { + case header.IPv4ProtocolNumber: + gso.Type = stack.GSOTCPv4 + gso.L3HdrLen = header.IPv4MinimumSize + case header.IPv6ProtocolNumber: + gso.Type = stack.GSOTCPv6 + gso.L3HdrLen = header.IPv6MinimumSize + default: + panic(fmt.Sprintf("Unknown netProto: %v", e.netProto)) + } + gso.NeedsCsum = true + gso.CsumOffset = header.TCPChecksumOffset() + gso.MaxSize = e.route.GSOMaxSize() + e.gso = gso +} diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 8a42f8593..230668b5d 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -153,7 +153,7 @@ func replyWithReset(s *segment) { ack := s.sequenceNumber.Add(s.logicalLen()) - sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0, nil) + sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0, nil /* options */, nil /* gso */) } // SetOption implements TransportProtocol.SetOption. diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index d751c7d8e..6317748cf 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -129,6 +129,9 @@ type sender struct { // It is initialized on demand. maxPayloadSize int + // gso is set if generic segmentation offload is enabled. + gso bool + // sndWndScale is the number of bits to shift left when reading the send // window size from a segment. sndWndScale uint8 @@ -194,6 +197,11 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1. last: iss, }, + gso: ep.gso != nil, + } + + if s.gso { + s.ep.gso.MSS = uint16(maxPayloadSize) } s.cc = s.initCongestionControl(ep.cc) @@ -244,6 +252,9 @@ func (s *sender) updateMaxPayloadSize(mtu, count int) { } s.maxPayloadSize = m + if s.gso { + s.ep.gso.MSS = uint16(m) + } s.outstanding -= count if s.outstanding < 0 { @@ -338,6 +349,15 @@ func (s *sender) resendSegment() { // Resend the segment. if seg := s.writeList.Front(); seg != nil { + if seg.data.Size() > s.maxPayloadSize { + available := s.maxPayloadSize + // Split this segment up. + nSeg := seg.clone() + nSeg.data.TrimFront(available) + nSeg.sequenceNumber.UpdateForward(seqnum.Size(available)) + s.writeList.InsertAfter(seg, nSeg) + seg.data.CapLength(available) + } s.sendSegment(seg.data, seg.flags, seg.sequenceNumber) s.ep.stack.Stats().TCP.FastRetransmit.Increment() s.ep.stack.Stats().TCP.Retransmits.Increment() @@ -408,11 +428,24 @@ func (s *sender) retransmitTimerExpired() bool { return true } +// pCount returns the number of packets in the segment. Due to GSO, a segment +// can be composed of multiple packets. +func (s *sender) pCount(seg *segment) int { + size := seg.data.Size() + if size == 0 { + return 1 + } + + return (size-1)/s.maxPayloadSize + 1 +} + // sendData sends new data segments. It is called when data becomes available or // when the send window opens up. func (s *sender) sendData() { limit := s.maxPayloadSize - + if s.gso { + limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize) + } // Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10. // "A TCP SHOULD set cwnd to no more than RW before beginning // transmission if the TCP has not sent data in the interval exceeding @@ -427,6 +460,10 @@ func (s *sender) sendData() { end := s.sndUna.Add(s.sndWnd) var dataSent bool for ; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() { + cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize + if cwndLimit < limit { + limit = cwndLimit + } // We abuse the flags field to determine if we have already // assigned a sequence number to this segment. if seg.flags == 0 { @@ -518,7 +555,7 @@ func (s *sender) sendData() { seg.data.CapLength(available) } - s.outstanding++ + s.outstanding += s.pCount(seg) segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) } @@ -744,8 +781,10 @@ func (s *sender) handleRcvdSegment(seg *segment) { datalen := seg.logicalLen() if datalen > ackLeft { + prevCount := s.pCount(seg) seg.data.TrimFront(int(ackLeft)) seg.sequenceNumber.UpdateForward(ackLeft) + s.outstanding -= prevCount - s.pCount(seg) break } @@ -753,7 +792,7 @@ func (s *sender) handleRcvdSegment(seg *segment) { s.writeNext = seg.Next() } s.writeList.Remove(seg) - s.outstanding-- + s.outstanding -= s.pCount(seg) seg.decRef() ackLeft -= datalen } diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index aa2a73829..5cef8ee97 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -699,7 +699,7 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) * synOptions := header.ParseSynOptions(tcpSeg.Options(), false) // Build options w/ tsVal to be sent in the SYN-ACK. - synAckOptions := make([]byte, 40) + synAckOptions := make([]byte, header.TCPOptionsMaximumSize) offset := 0 if wantOptions.TS { offset += header.EncodeTSOption(wantOptions.TSVal, synOptions.TSVal, synAckOptions[offset:]) @@ -847,7 +847,7 @@ func (c *Context) PassiveConnect(maxPayload, wndScale int, synOptions header.TCP // value of the window scaling option to be sent in the SYN. If synOptions.WS > // 0 then we send the WindowScale option. func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions header.TCPSynOptions) *RawEndpoint { - opts := make([]byte, 40) + opts := make([]byte, header.TCPOptionsMaximumSize) offset := 0 offset += header.EncodeMSSOption(uint32(maxPayload), opts) diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index b68ed8561..5637f46e3 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -651,7 +651,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u // Track count of packets sent. r.Stats().UDP.PacketsSent.Increment() - return r.WritePacket(hdr, data, ProtocolNumber, ttl) + return r.WritePacket(nil /* gso */, hdr, data, ProtocolNumber, ttl) } func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) { |