summaryrefslogtreecommitdiffhomepage
path: root/pkg/tcpip/transport
diff options
context:
space:
mode:
authorAndrei Vagin <avagin@google.com>2019-03-28 11:02:23 -0700
committerShentubot <shentubot@google.com>2019-03-28 11:03:41 -0700
commitf4105ac21a9f11f5231681239ca92ac814b5149d (patch)
tree2ecd11f283e674bce8cd29b514ae0745fdd0fa83 /pkg/tcpip/transport
parent9c188978870051f0b42ceb1a3f16320286936976 (diff)
netstack/fdbased: add generic segmentation offload (GSO) support
The linux packet socket can handle GSO packets, so we can segment packets to 64K instead of the MTU which is usually 1500. Here are numbers for the nginx-1m test: runsc: 579330.01 [Kbytes/sec] received runsc-gso: 1794121.66 [Kbytes/sec] received runc: 2122139.06 [Kbytes/sec] received and for tcp_benchmark: $ tcp_benchmark --duration 15 --ideal [ 4] 0.0-15.0 sec 86647 MBytes 48456 Mbits/sec $ tcp_benchmark --client --duration 15 --ideal [ 4] 0.0-15.0 sec 2173 MBytes 1214 Mbits/sec $ tcp_benchmark --client --duration 15 --ideal --gso 65536 [ 4] 0.0-15.0 sec 19357 MBytes 10825 Mbits/sec PiperOrigin-RevId: 240809103 Change-Id: I2637f104db28b5d4c64e1e766c610162a195775a
Diffstat (limited to 'pkg/tcpip/transport')
-rw-r--r--pkg/tcpip/transport/icmp/endpoint.go6
-rw-r--r--pkg/tcpip/transport/tcp/accept.go2
-rw-r--r--pkg/tcpip/transport/tcp/connect.go21
-rw-r--r--pkg/tcpip/transport/tcp/endpoint.go27
-rw-r--r--pkg/tcpip/transport/tcp/protocol.go2
-rw-r--r--pkg/tcpip/transport/tcp/snd.go45
-rw-r--r--pkg/tcpip/transport/tcp/testing/context/context.go4
-rw-r--r--pkg/tcpip/transport/udp/endpoint.go2
8 files changed, 91 insertions, 18 deletions
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index d876005fe..182097b46 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -370,7 +370,7 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
func (e *endpoint) send4(r *stack.Route, data buffer.View) *tcpip.Error {
if e.raw {
hdr := buffer.NewPrependable(len(data) + int(r.MaxHeaderLength()))
- return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL())
+ return r.WritePacket(nil /* gso */, hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL())
}
if len(data) < header.ICMPv4EchoMinimumSize {
@@ -395,7 +395,7 @@ func (e *endpoint) send4(r *stack.Route, data buffer.View) *tcpip.Error {
icmpv4.SetChecksum(0)
icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(data, 0)))
- return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL())
+ return r.WritePacket(nil /* gso */, hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL())
}
func send6(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error {
@@ -419,7 +419,7 @@ func send6(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error {
icmpv6.SetChecksum(0)
icmpv6.SetChecksum(^header.Checksum(icmpv6, header.Checksum(data, 0)))
- return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv6ProtocolNumber, r.DefaultTTL())
+ return r.WritePacket(nil /* gso */, hdr, data.ToVectorisedView(), header.ICMPv6ProtocolNumber, r.DefaultTTL())
}
func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 7a19737c7..a3894ed8f 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -214,6 +214,8 @@ func (l *listenContext) createConnectedEndpoint(s *segment, iss seqnum.Value, ir
n.maybeEnableTimestamp(rcvdSynOpts)
n.maybeEnableSACKPermitted(rcvdSynOpts)
+ n.initGSO()
+
// Register new endpoint so that packets are routed to it.
if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.id, n, n.reusePort); err != nil {
n.Close()
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index c4353718e..056e0b09a 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -557,14 +557,14 @@ func sendSynTCP(r *stack.Route, id stack.TransportEndpointID, flags byte, seq, a
}
options := makeSynOptions(opts)
- err := sendTCP(r, id, buffer.VectorisedView{}, r.DefaultTTL(), flags, seq, ack, rcvWnd, options)
+ err := sendTCP(r, id, buffer.VectorisedView{}, r.DefaultTTL(), flags, seq, ack, rcvWnd, options, nil)
putOptions(options)
return err
}
// sendTCP sends a TCP segment with the provided options via the provided
// network endpoint and under the provided identity.
-func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte) *tcpip.Error {
+func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error {
optLen := len(opts)
// Allocate a buffer for the TCP header.
hdr := buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen)
@@ -586,12 +586,17 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise
})
copy(tcp[header.TCPMinimumSize:], opts)
+ length := uint16(hdr.UsedLength() + data.Size())
+ xsum := r.PseudoHeaderChecksum(ProtocolNumber, length)
// Only calculate the checksum if offloading isn't supported.
- if r.Capabilities()&stack.CapabilityChecksumOffload == 0 {
- length := uint16(hdr.UsedLength() + data.Size())
- xsum := r.PseudoHeaderChecksum(ProtocolNumber, length)
+ if gso != nil && gso.NeedsCsum {
+ // This is called CHECKSUM_PARTIAL in the Linux kernel. We
+ // calculate a checksum of the pseudo-header and save it in the
+ // TCP header, then the kernel calculate a checksum of the
+ // header and data and get the right sum of the TCP packet.
+ tcp.SetChecksum(xsum)
+ } else if r.Capabilities()&stack.CapabilityChecksumOffload == 0 {
xsum = header.ChecksumVV(data, xsum)
-
tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
}
@@ -600,7 +605,7 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise
r.Stats().TCP.ResetsSent.Increment()
}
- return r.WritePacket(hdr, data, ProtocolNumber, ttl)
+ return r.WritePacket(gso, hdr, data, ProtocolNumber, ttl)
}
// makeOptions makes an options slice.
@@ -649,7 +654,7 @@ func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqn
sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
}
options := e.makeOptions(sackBlocks)
- err := sendTCP(&e.route, e.id, data, e.route.DefaultTTL(), flags, seq, ack, rcvWnd, options)
+ err := sendTCP(&e.route, e.id, data, e.route.DefaultTTL(), flags, seq, ack, rcvWnd, options, e.gso)
putOptions(options)
return err
}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 5656890f6..0427af34f 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -15,6 +15,7 @@
package tcp
import (
+ "fmt"
"math"
"sync"
"sync/atomic"
@@ -265,6 +266,8 @@ type endpoint struct {
// The following are only used to assist the restore run to re-connect.
bindAddress tcpip.Address
connectingAddress tcpip.Address
+
+ gso *stack.GSO
}
// StopWork halts packet processing. Only to be used in tests.
@@ -1155,6 +1158,8 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
e.effectiveNetProtos = netProtos
e.connectingAddress = connectingAddr
+ e.initGSO()
+
// Connect in the restore phase does not perform handshake. Restore its
// connection setting here.
if !handshake {
@@ -1698,3 +1703,25 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
}
return s
}
+
+func (e *endpoint) initGSO() {
+ if e.route.Capabilities()&stack.CapabilityGSO == 0 {
+ return
+ }
+
+ gso := &stack.GSO{}
+ switch e.netProto {
+ case header.IPv4ProtocolNumber:
+ gso.Type = stack.GSOTCPv4
+ gso.L3HdrLen = header.IPv4MinimumSize
+ case header.IPv6ProtocolNumber:
+ gso.Type = stack.GSOTCPv6
+ gso.L3HdrLen = header.IPv6MinimumSize
+ default:
+ panic(fmt.Sprintf("Unknown netProto: %v", e.netProto))
+ }
+ gso.NeedsCsum = true
+ gso.CsumOffset = header.TCPChecksumOffset()
+ gso.MaxSize = e.route.GSOMaxSize()
+ e.gso = gso
+}
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 8a42f8593..230668b5d 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -153,7 +153,7 @@ func replyWithReset(s *segment) {
ack := s.sequenceNumber.Add(s.logicalLen())
- sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0, nil)
+ sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0, nil /* options */, nil /* gso */)
}
// SetOption implements TransportProtocol.SetOption.
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index d751c7d8e..6317748cf 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -129,6 +129,9 @@ type sender struct {
// It is initialized on demand.
maxPayloadSize int
+ // gso is set if generic segmentation offload is enabled.
+ gso bool
+
// sndWndScale is the number of bits to shift left when reading the send
// window size from a segment.
sndWndScale uint8
@@ -194,6 +197,11 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1.
last: iss,
},
+ gso: ep.gso != nil,
+ }
+
+ if s.gso {
+ s.ep.gso.MSS = uint16(maxPayloadSize)
}
s.cc = s.initCongestionControl(ep.cc)
@@ -244,6 +252,9 @@ func (s *sender) updateMaxPayloadSize(mtu, count int) {
}
s.maxPayloadSize = m
+ if s.gso {
+ s.ep.gso.MSS = uint16(m)
+ }
s.outstanding -= count
if s.outstanding < 0 {
@@ -338,6 +349,15 @@ func (s *sender) resendSegment() {
// Resend the segment.
if seg := s.writeList.Front(); seg != nil {
+ if seg.data.Size() > s.maxPayloadSize {
+ available := s.maxPayloadSize
+ // Split this segment up.
+ nSeg := seg.clone()
+ nSeg.data.TrimFront(available)
+ nSeg.sequenceNumber.UpdateForward(seqnum.Size(available))
+ s.writeList.InsertAfter(seg, nSeg)
+ seg.data.CapLength(available)
+ }
s.sendSegment(seg.data, seg.flags, seg.sequenceNumber)
s.ep.stack.Stats().TCP.FastRetransmit.Increment()
s.ep.stack.Stats().TCP.Retransmits.Increment()
@@ -408,11 +428,24 @@ func (s *sender) retransmitTimerExpired() bool {
return true
}
+// pCount returns the number of packets in the segment. Due to GSO, a segment
+// can be composed of multiple packets.
+func (s *sender) pCount(seg *segment) int {
+ size := seg.data.Size()
+ if size == 0 {
+ return 1
+ }
+
+ return (size-1)/s.maxPayloadSize + 1
+}
+
// sendData sends new data segments. It is called when data becomes available or
// when the send window opens up.
func (s *sender) sendData() {
limit := s.maxPayloadSize
-
+ if s.gso {
+ limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize)
+ }
// Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10.
// "A TCP SHOULD set cwnd to no more than RW before beginning
// transmission if the TCP has not sent data in the interval exceeding
@@ -427,6 +460,10 @@ func (s *sender) sendData() {
end := s.sndUna.Add(s.sndWnd)
var dataSent bool
for ; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() {
+ cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize
+ if cwndLimit < limit {
+ limit = cwndLimit
+ }
// We abuse the flags field to determine if we have already
// assigned a sequence number to this segment.
if seg.flags == 0 {
@@ -518,7 +555,7 @@ func (s *sender) sendData() {
seg.data.CapLength(available)
}
- s.outstanding++
+ s.outstanding += s.pCount(seg)
segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size()))
}
@@ -744,8 +781,10 @@ func (s *sender) handleRcvdSegment(seg *segment) {
datalen := seg.logicalLen()
if datalen > ackLeft {
+ prevCount := s.pCount(seg)
seg.data.TrimFront(int(ackLeft))
seg.sequenceNumber.UpdateForward(ackLeft)
+ s.outstanding -= prevCount - s.pCount(seg)
break
}
@@ -753,7 +792,7 @@ func (s *sender) handleRcvdSegment(seg *segment) {
s.writeNext = seg.Next()
}
s.writeList.Remove(seg)
- s.outstanding--
+ s.outstanding -= s.pCount(seg)
seg.decRef()
ackLeft -= datalen
}
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index aa2a73829..5cef8ee97 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -699,7 +699,7 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
synOptions := header.ParseSynOptions(tcpSeg.Options(), false)
// Build options w/ tsVal to be sent in the SYN-ACK.
- synAckOptions := make([]byte, 40)
+ synAckOptions := make([]byte, header.TCPOptionsMaximumSize)
offset := 0
if wantOptions.TS {
offset += header.EncodeTSOption(wantOptions.TSVal, synOptions.TSVal, synAckOptions[offset:])
@@ -847,7 +847,7 @@ func (c *Context) PassiveConnect(maxPayload, wndScale int, synOptions header.TCP
// value of the window scaling option to be sent in the SYN. If synOptions.WS >
// 0 then we send the WindowScale option.
func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions header.TCPSynOptions) *RawEndpoint {
- opts := make([]byte, 40)
+ opts := make([]byte, header.TCPOptionsMaximumSize)
offset := 0
offset += header.EncodeMSSOption(uint32(maxPayload), opts)
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index b68ed8561..5637f46e3 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -651,7 +651,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
// Track count of packets sent.
r.Stats().UDP.PacketsSent.Increment()
- return r.WritePacket(hdr, data, ProtocolNumber, ttl)
+ return r.WritePacket(nil /* gso */, hdr, data, ProtocolNumber, ttl)
}
func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) {