diff options
Diffstat (limited to 'pkg/tcpip/transport/tcp')
-rw-r--r-- | pkg/tcpip/transport/tcp/accept.go | 2 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/connect.go | 21 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/endpoint.go | 27 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/protocol.go | 2 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/snd.go | 45 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/testing/context/context.go | 4 |
6 files changed, 87 insertions, 14 deletions
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 7a19737c7..a3894ed8f 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -214,6 +214,8 @@ func (l *listenContext) createConnectedEndpoint(s *segment, iss seqnum.Value, ir n.maybeEnableTimestamp(rcvdSynOpts) n.maybeEnableSACKPermitted(rcvdSynOpts) + n.initGSO() + // Register new endpoint so that packets are routed to it. if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.id, n, n.reusePort); err != nil { n.Close() diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index c4353718e..056e0b09a 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -557,14 +557,14 @@ func sendSynTCP(r *stack.Route, id stack.TransportEndpointID, flags byte, seq, a } options := makeSynOptions(opts) - err := sendTCP(r, id, buffer.VectorisedView{}, r.DefaultTTL(), flags, seq, ack, rcvWnd, options) + err := sendTCP(r, id, buffer.VectorisedView{}, r.DefaultTTL(), flags, seq, ack, rcvWnd, options, nil) putOptions(options) return err } // sendTCP sends a TCP segment with the provided options via the provided // network endpoint and under the provided identity. -func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte) *tcpip.Error { +func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.VectorisedView, ttl uint8, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size, opts []byte, gso *stack.GSO) *tcpip.Error { optLen := len(opts) // Allocate a buffer for the TCP header. hdr := buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen) @@ -586,12 +586,17 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise }) copy(tcp[header.TCPMinimumSize:], opts) + length := uint16(hdr.UsedLength() + data.Size()) + xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) // Only calculate the checksum if offloading isn't supported. - if r.Capabilities()&stack.CapabilityChecksumOffload == 0 { - length := uint16(hdr.UsedLength() + data.Size()) - xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) + if gso != nil && gso.NeedsCsum { + // This is called CHECKSUM_PARTIAL in the Linux kernel. We + // calculate a checksum of the pseudo-header and save it in the + // TCP header, then the kernel calculate a checksum of the + // header and data and get the right sum of the TCP packet. + tcp.SetChecksum(xsum) + } else if r.Capabilities()&stack.CapabilityChecksumOffload == 0 { xsum = header.ChecksumVV(data, xsum) - tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) } @@ -600,7 +605,7 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise r.Stats().TCP.ResetsSent.Increment() } - return r.WritePacket(hdr, data, ProtocolNumber, ttl) + return r.WritePacket(gso, hdr, data, ProtocolNumber, ttl) } // makeOptions makes an options slice. @@ -649,7 +654,7 @@ func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqn sackBlocks = e.sack.Blocks[:e.sack.NumBlocks] } options := e.makeOptions(sackBlocks) - err := sendTCP(&e.route, e.id, data, e.route.DefaultTTL(), flags, seq, ack, rcvWnd, options) + err := sendTCP(&e.route, e.id, data, e.route.DefaultTTL(), flags, seq, ack, rcvWnd, options, e.gso) putOptions(options) return err } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 5656890f6..0427af34f 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -15,6 +15,7 @@ package tcp import ( + "fmt" "math" "sync" "sync/atomic" @@ -265,6 +266,8 @@ type endpoint struct { // The following are only used to assist the restore run to re-connect. bindAddress tcpip.Address connectingAddress tcpip.Address + + gso *stack.GSO } // StopWork halts packet processing. Only to be used in tests. @@ -1155,6 +1158,8 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er e.effectiveNetProtos = netProtos e.connectingAddress = connectingAddr + e.initGSO() + // Connect in the restore phase does not perform handshake. Restore its // connection setting here. if !handshake { @@ -1698,3 +1703,25 @@ func (e *endpoint) completeState() stack.TCPEndpointState { } return s } + +func (e *endpoint) initGSO() { + if e.route.Capabilities()&stack.CapabilityGSO == 0 { + return + } + + gso := &stack.GSO{} + switch e.netProto { + case header.IPv4ProtocolNumber: + gso.Type = stack.GSOTCPv4 + gso.L3HdrLen = header.IPv4MinimumSize + case header.IPv6ProtocolNumber: + gso.Type = stack.GSOTCPv6 + gso.L3HdrLen = header.IPv6MinimumSize + default: + panic(fmt.Sprintf("Unknown netProto: %v", e.netProto)) + } + gso.NeedsCsum = true + gso.CsumOffset = header.TCPChecksumOffset() + gso.MaxSize = e.route.GSOMaxSize() + e.gso = gso +} diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 8a42f8593..230668b5d 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -153,7 +153,7 @@ func replyWithReset(s *segment) { ack := s.sequenceNumber.Add(s.logicalLen()) - sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0, nil) + sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0, nil /* options */, nil /* gso */) } // SetOption implements TransportProtocol.SetOption. diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index d751c7d8e..6317748cf 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -129,6 +129,9 @@ type sender struct { // It is initialized on demand. maxPayloadSize int + // gso is set if generic segmentation offload is enabled. + gso bool + // sndWndScale is the number of bits to shift left when reading the send // window size from a segment. sndWndScale uint8 @@ -194,6 +197,11 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1. last: iss, }, + gso: ep.gso != nil, + } + + if s.gso { + s.ep.gso.MSS = uint16(maxPayloadSize) } s.cc = s.initCongestionControl(ep.cc) @@ -244,6 +252,9 @@ func (s *sender) updateMaxPayloadSize(mtu, count int) { } s.maxPayloadSize = m + if s.gso { + s.ep.gso.MSS = uint16(m) + } s.outstanding -= count if s.outstanding < 0 { @@ -338,6 +349,15 @@ func (s *sender) resendSegment() { // Resend the segment. if seg := s.writeList.Front(); seg != nil { + if seg.data.Size() > s.maxPayloadSize { + available := s.maxPayloadSize + // Split this segment up. + nSeg := seg.clone() + nSeg.data.TrimFront(available) + nSeg.sequenceNumber.UpdateForward(seqnum.Size(available)) + s.writeList.InsertAfter(seg, nSeg) + seg.data.CapLength(available) + } s.sendSegment(seg.data, seg.flags, seg.sequenceNumber) s.ep.stack.Stats().TCP.FastRetransmit.Increment() s.ep.stack.Stats().TCP.Retransmits.Increment() @@ -408,11 +428,24 @@ func (s *sender) retransmitTimerExpired() bool { return true } +// pCount returns the number of packets in the segment. Due to GSO, a segment +// can be composed of multiple packets. +func (s *sender) pCount(seg *segment) int { + size := seg.data.Size() + if size == 0 { + return 1 + } + + return (size-1)/s.maxPayloadSize + 1 +} + // sendData sends new data segments. It is called when data becomes available or // when the send window opens up. func (s *sender) sendData() { limit := s.maxPayloadSize - + if s.gso { + limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize) + } // Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10. // "A TCP SHOULD set cwnd to no more than RW before beginning // transmission if the TCP has not sent data in the interval exceeding @@ -427,6 +460,10 @@ func (s *sender) sendData() { end := s.sndUna.Add(s.sndWnd) var dataSent bool for ; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() { + cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize + if cwndLimit < limit { + limit = cwndLimit + } // We abuse the flags field to determine if we have already // assigned a sequence number to this segment. if seg.flags == 0 { @@ -518,7 +555,7 @@ func (s *sender) sendData() { seg.data.CapLength(available) } - s.outstanding++ + s.outstanding += s.pCount(seg) segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) } @@ -744,8 +781,10 @@ func (s *sender) handleRcvdSegment(seg *segment) { datalen := seg.logicalLen() if datalen > ackLeft { + prevCount := s.pCount(seg) seg.data.TrimFront(int(ackLeft)) seg.sequenceNumber.UpdateForward(ackLeft) + s.outstanding -= prevCount - s.pCount(seg) break } @@ -753,7 +792,7 @@ func (s *sender) handleRcvdSegment(seg *segment) { s.writeNext = seg.Next() } s.writeList.Remove(seg) - s.outstanding-- + s.outstanding -= s.pCount(seg) seg.decRef() ackLeft -= datalen } diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index aa2a73829..5cef8ee97 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -699,7 +699,7 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) * synOptions := header.ParseSynOptions(tcpSeg.Options(), false) // Build options w/ tsVal to be sent in the SYN-ACK. - synAckOptions := make([]byte, 40) + synAckOptions := make([]byte, header.TCPOptionsMaximumSize) offset := 0 if wantOptions.TS { offset += header.EncodeTSOption(wantOptions.TSVal, synOptions.TSVal, synAckOptions[offset:]) @@ -847,7 +847,7 @@ func (c *Context) PassiveConnect(maxPayload, wndScale int, synOptions header.TCP // value of the window scaling option to be sent in the SYN. If synOptions.WS > // 0 then we send the WindowScale option. func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions header.TCPSynOptions) *RawEndpoint { - opts := make([]byte, 40) + opts := make([]byte, header.TCPOptionsMaximumSize) offset := 0 offset += header.EncodeMSSOption(uint32(maxPayload), opts) |