// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tcp contains the implementation of the TCP transport protocol. To use // it in the networking stack, this package must be added to the project, and // activated on the stack by passing tcp.NewProtocol() as one of the // transport protocols when calling stack.New(). Then endpoints can be created // by passing tcp.ProtocolNumber as the transport protocol number when calling // Stack.NewEndpoint(). package tcp import ( "runtime" "strings" "time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/waiter" ) const ( // ProtocolNumber is the tcp protocol number. ProtocolNumber = header.TCPProtocolNumber // MinBufferSize is the smallest size of a receive or send buffer. MinBufferSize = 4 << 10 // 4096 bytes. // DefaultSendBufferSize is the default size of the send buffer for // an endpoint. DefaultSendBufferSize = 1 << 20 // 1MB // DefaultReceiveBufferSize is the default size of the receive buffer // for an endpoint. DefaultReceiveBufferSize = 1 << 20 // 1MB // MaxBufferSize is the largest size a receive/send buffer can grow to. MaxBufferSize = 4 << 20 // 4MB // MaxUnprocessedSegments is the maximum number of unprocessed segments // that can be queued for a given endpoint. MaxUnprocessedSegments = 300 // DefaultTCPLingerTimeout is the amount of time that sockets linger in // FIN_WAIT_2 state before being marked closed. DefaultTCPLingerTimeout = 60 * time.Second // DefaultTCPTimeWaitTimeout is the amount of time that sockets linger // in TIME_WAIT state before being marked closed. DefaultTCPTimeWaitTimeout = 60 * time.Second ) // SACKEnabled option can be used to enable SACK support in the TCP // protocol. See: https://tools.ietf.org/html/rfc2018. type SACKEnabled bool // DelayEnabled option can be used to enable Nagle's algorithm in the TCP protocol. type DelayEnabled bool // SendBufferSizeOption allows the default, min and max send buffer sizes for // TCP endpoints to be queried or configured. type SendBufferSizeOption struct { Min int Default int Max int } // ReceiveBufferSizeOption allows the default, min and max receive buffer size // for TCP endpoints to be queried or configured. type ReceiveBufferSizeOption struct { Min int Default int Max int } const ( ccReno = "reno" ccCubic = "cubic" ) // syncRcvdCounter tracks the number of endpoints in the SYN-RCVD state. The // value is protected by a mutex so that we can increment only when it's // guaranteed not to go above a threshold. type synRcvdCounter struct { sync.Mutex value uint64 pending sync.WaitGroup threshold uint64 } // inc tries to increment the global number of endpoints in SYN-RCVD state. It // succeeds if the increment doesn't make the count go beyond the threshold, and // fails otherwise. func (s *synRcvdCounter) inc() bool { s.Lock() defer s.Unlock() if s.value >= s.threshold { return false } s.pending.Add(1) s.value++ return true } // dec atomically decrements the global number of endpoints in SYN-RCVD // state. It must only be called if a previous call to inc succeeded. func (s *synRcvdCounter) dec() { s.Lock() defer s.Unlock() s.value-- s.pending.Done() } // synCookiesInUse returns true if the synRcvdCount is greater than // SynRcvdCountThreshold. func (s *synRcvdCounter) synCookiesInUse() bool { s.Lock() defer s.Unlock() return s.value >= s.threshold } // SetThreshold sets synRcvdCounter.Threshold to ths new threshold. func (s *synRcvdCounter) SetThreshold(threshold uint64) { s.Lock() defer s.Unlock() s.threshold = threshold } // Threshold returns the current value of synRcvdCounter.Threhsold. func (s *synRcvdCounter) Threshold() uint64 { s.Lock() defer s.Unlock() return s.threshold } type protocol struct { mu sync.RWMutex sackEnabled bool delayEnabled bool sendBufferSize SendBufferSizeOption recvBufferSize ReceiveBufferSizeOption congestionControl string availableCongestionControl []string moderateReceiveBuffer bool tcpLingerTimeout time.Duration tcpTimeWaitTimeout time.Duration minRTO time.Duration synRcvdCount synRcvdCounter dispatcher *dispatcher } // Number returns the tcp protocol number. func (*protocol) Number() tcpip.TransportProtocolNumber { return ProtocolNumber } // NewEndpoint creates a new tcp endpoint. func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { return newEndpoint(stack, netProto, waiterQueue), nil } // NewRawEndpoint creates a new raw TCP endpoint. Raw TCP sockets are currently // unsupported. It implements stack.TransportProtocol.NewRawEndpoint. func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { return raw.NewEndpoint(stack, netProto, header.TCPProtocolNumber, waiterQueue) } // MinimumPacketSize returns the minimum valid tcp packet size. func (*protocol) MinimumPacketSize() int { return header.TCPMinimumSize } // ParsePorts returns the source and destination ports stored in the given tcp // packet. func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { h := header.TCP(v) return h.SourcePort(), h.DestinationPort(), nil } // QueuePacket queues packets targeted at an endpoint after hashing the packet // to a specific processing queue. Each queue is serviced by its own processor // goroutine which is responsible for dequeuing and doing full TCP dispatch of // the packet. func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt stack.PacketBuffer) { p.dispatcher.queuePacket(r, ep, id, pkt) } // HandleUnknownDestinationPacket handles packets targeted at this protocol but // that don't match any existing endpoint. // // RFC 793, page 36, states that "If the connection does not exist (CLOSED) then // a reset is sent in response to any incoming segment except another reset. In // particular, SYNs addressed to a non-existent connection are rejected by this // means." func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool { s := newSegment(r, id, pkt) defer s.decRef() if !s.parse() || !s.csumValid { return false } // There's nothing to do if this is already a reset packet. if s.flagIsSet(header.TCPFlagRst) { return true } replyWithReset(s) return true } // replyWithReset replies to the given segment with a reset segment. func replyWithReset(s *segment) { // Get the seqnum from the packet if the ack flag is set. seq := seqnum.Value(0) ack := seqnum.Value(0) flags := byte(header.TCPFlagRst) // As per RFC 793 page 35 (Reset Generation) // 1. If the connection does not exist (CLOSED) then a reset is sent // in response to any incoming segment except another reset. In // particular, SYNs addressed to a non-existent connection are rejected // by this means. // If the incoming segment has an ACK field, the reset takes its // sequence number from the ACK field of the segment, otherwise the // reset has sequence number zero and the ACK field is set to the sum // of the sequence number and segment length of the incoming segment. // The connection remains in the CLOSED state. if s.flagIsSet(header.TCPFlagAck) { seq = s.ackNumber } else { flags |= header.TCPFlagAck ack = s.sequenceNumber.Add(s.logicalLen()) } sendTCP(&s.route, tcpFields{ id: s.id, ttl: s.route.DefaultTTL(), tos: stack.DefaultTOS, flags: flags, seq: seq, ack: ack, rcvWnd: 0, }, buffer.VectorisedView{}, nil /* gso */, nil /* PacketOwner */) } // SetOption implements stack.TransportProtocol.SetOption. func (p *protocol) SetOption(option interface{}) *tcpip.Error { switch v := option.(type) { case SACKEnabled: p.mu.Lock() p.sackEnabled = bool(v) p.mu.Unlock() return nil case DelayEnabled: p.mu.Lock() p.delayEnabled = bool(v) p.mu.Unlock() return nil case SendBufferSizeOption: if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { return tcpip.ErrInvalidOptionValue } p.mu.Lock() p.sendBufferSize = v p.mu.Unlock() return nil case ReceiveBufferSizeOption: if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { return tcpip.ErrInvalidOptionValue } p.mu.Lock() p.recvBufferSize = v p.mu.Unlock() return nil case tcpip.CongestionControlOption: for _, c := range p.availableCongestionControl { if string(v) == c { p.mu.Lock() p.congestionControl = string(v) p.mu.Unlock() return nil } } // linux returns ENOENT when an invalid congestion control // is specified. return tcpip.ErrNoSuchFile case tcpip.ModerateReceiveBufferOption: p.mu.Lock() p.moderateReceiveBuffer = bool(v) p.mu.Unlock() return nil case tcpip.TCPLingerTimeoutOption: if v < 0 { v = 0 } p.mu.Lock() p.tcpLingerTimeout = time.Duration(v) p.mu.Unlock() return nil case tcpip.TCPTimeWaitTimeoutOption: if v < 0 { v = 0 } p.mu.Lock() p.tcpTimeWaitTimeout = time.Duration(v) p.mu.Unlock() return nil case tcpip.TCPMinRTOOption: if v < 0 { v = tcpip.TCPMinRTOOption(MinRTO) } p.mu.Lock() p.minRTO = time.Duration(v) p.mu.Unlock() return nil case tcpip.TCPSynRcvdCountThresholdOption: p.mu.Lock() p.synRcvdCount.SetThreshold(uint64(v)) p.mu.Unlock() return nil default: return tcpip.ErrUnknownProtocolOption } } // Option implements stack.TransportProtocol.Option. func (p *protocol) Option(option interface{}) *tcpip.Error { switch v := option.(type) { case *SACKEnabled: p.mu.RLock() *v = SACKEnabled(p.sackEnabled) p.mu.RUnlock() return nil case *DelayEnabled: p.mu.RLock() *v = DelayEnabled(p.delayEnabled) p.mu.RUnlock() return nil case *SendBufferSizeOption: p.mu.RLock() *v = p.sendBufferSize p.mu.RUnlock() return nil case *ReceiveBufferSizeOption: p.mu.RLock() *v = p.recvBufferSize p.mu.RUnlock() return nil case *tcpip.CongestionControlOption: p.mu.RLock() *v = tcpip.CongestionControlOption(p.congestionControl) p.mu.RUnlock() return nil case *tcpip.AvailableCongestionControlOption: p.mu.RLock() *v = tcpip.AvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " ")) p.mu.RUnlock() return nil case *tcpip.ModerateReceiveBufferOption: p.mu.RLock() *v = tcpip.ModerateReceiveBufferOption(p.moderateReceiveBuffer) p.mu.RUnlock() return nil case *tcpip.TCPLingerTimeoutOption: p.mu.RLock() *v = tcpip.TCPLingerTimeoutOption(p.tcpLingerTimeout) p.mu.RUnlock() return nil case *tcpip.TCPTimeWaitTimeoutOption: p.mu.RLock() *v = tcpip.TCPTimeWaitTimeoutOption(p.tcpTimeWaitTimeout) p.mu.RUnlock() return nil case *tcpip.TCPMinRTOOption: p.mu.RLock() *v = tcpip.TCPMinRTOOption(p.minRTO) p.mu.RUnlock() return nil case *tcpip.TCPSynRcvdCountThresholdOption: p.mu.RLock() *v = tcpip.TCPSynRcvdCountThresholdOption(p.synRcvdCount.Threshold()) p.mu.RUnlock() return nil default: return tcpip.ErrUnknownProtocolOption } } // Close implements stack.TransportProtocol.Close. func (p *protocol) Close() { p.dispatcher.close() } // Wait implements stack.TransportProtocol.Wait. func (p *protocol) Wait() { p.dispatcher.wait() } // SynRcvdCounter returns a reference to the synRcvdCount for this protocol // instance. func (p *protocol) SynRcvdCounter() *synRcvdCounter { return &p.synRcvdCount } // NewProtocol returns a TCP transport protocol. func NewProtocol() stack.TransportProtocol { return &protocol{ sendBufferSize: SendBufferSizeOption{MinBufferSize, DefaultSendBufferSize, MaxBufferSize}, recvBufferSize: ReceiveBufferSizeOption{MinBufferSize, DefaultReceiveBufferSize, MaxBufferSize}, congestionControl: ccReno, availableCongestionControl: []string{ccReno, ccCubic}, tcpLingerTimeout: DefaultTCPLingerTimeout, tcpTimeWaitTimeout: DefaultTCPTimeWaitTimeout, synRcvdCount: synRcvdCounter{threshold: SynRcvdCountThreshold}, dispatcher: newDispatcher(runtime.GOMAXPROCS(0)), minRTO: MinRTO, } }