// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"crypto/sha1"
	"encoding/binary"
	"fmt"
	"hash"
	"io"
	"sync/atomic"
	"time"

	"gvisor.dev/gvisor/pkg/sleep"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/ports"
	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/waiter"
)

const (
	// tsLen is the length, in bits, of the timestamp in the SYN cookie.
	tsLen = 8

	// tsMask is a mask for timestamp values (i.e., tsLen bits).
	tsMask = (1 << tsLen) - 1

	// tsOffset is the offset, in bits, of the timestamp in the SYN cookie.
	tsOffset = 24

	// hashMask is the mask for hash values (i.e., tsOffset bits).
	hashMask = (1 << tsOffset) - 1

	// maxTSDiff is the maximum allowed difference between a received cookie
	// timestamp and the current timestamp. If the difference is greater
	// than maxTSDiff, the cookie is expired.
	maxTSDiff = 2
)

var (
	// mssTable is a slice containing the possible MSS values that we
	// encode in the SYN cookie with two bits.
	mssTable = []uint16{536, 1300, 1440, 1460}
)

func encodeMSS(mss uint16) uint32 {
	for i := len(mssTable) - 1; i > 0; i-- {
		if mss >= mssTable[i] {
			return uint32(i)
		}
	}
	return 0
}

// listenContext is used by a listening endpoint to store state used while
// listening for connections. This struct is allocated by the listen goroutine
// and must not be accessed or have its methods called concurrently as they
// may mutate the stored objects.
type listenContext struct {
	stack    *stack.Stack
	protocol *protocol

	// rcvWnd is the receive window that is sent by this listening context
	// in the initial SYN-ACK.
	rcvWnd seqnum.Size

	// nonce are random bytes that are initialized once when the context
	// is created and used to seed the hash function when generating
	// the SYN cookie.
	nonce [2][sha1.BlockSize]byte

	// listenEP is a reference to the listening endpoint associated with
	// this context. Can be nil if the context is created by the forwarder.
	listenEP *endpoint

	// hasherMu protects hasher.
	hasherMu sync.Mutex
	// hasher is the hash function used to generate a SYN cookie.
	hasher hash.Hash

	// v6Only is true if listenEP is a dual stack socket and has the
	// IPV6_V6ONLY option set.
	v6Only bool

	// netProto indicates the network protocol(IPv4/v6) for the listening
	// endpoint.
	netProto tcpip.NetworkProtocolNumber

	// pendingMu protects pendingEndpoints. This should only be accessed
	// by the listening endpoint's worker goroutine.
	//
	// Lock Ordering: listenEP.workerMu -> pendingMu
	pendingMu sync.Mutex
	// pending is used to wait for all pendingEndpoints to finish when
	// a socket is closed.
	pending sync.WaitGroup
	// pendingEndpoints is a set of all endpoints for which a handshake is
	// in progress.
	pendingEndpoints map[*endpoint]struct{}
}

// timeStamp returns an 8-bit timestamp with a granularity of 64 seconds.
func timeStamp(clock tcpip.Clock) uint32 {
	return uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Seconds()) >> 6 & tsMask
}

// newListenContext creates a new listen context.
func newListenContext(stk *stack.Stack, protocol *protocol, listenEP *endpoint, rcvWnd seqnum.Size, v6Only bool, netProto tcpip.NetworkProtocolNumber) *listenContext {
	l := &listenContext{
		stack:            stk,
		protocol:         protocol,
		rcvWnd:           rcvWnd,
		hasher:           sha1.New(),
		v6Only:           v6Only,
		netProto:         netProto,
		listenEP:         listenEP,
		pendingEndpoints: make(map[*endpoint]struct{}),
	}

	for i := range l.nonce {
		if _, err := io.ReadFull(stk.SecureRNG(), l.nonce[i][:]); err != nil {
			panic(err)
		}
	}

	return l
}

// cookieHash calculates the cookieHash for the given id, timestamp and nonce
// index. The hash is used to create and validate cookies.
func (l *listenContext) cookieHash(id stack.TransportEndpointID, ts uint32, nonceIndex int) uint32 {

	// Initialize block with fixed-size data: local ports and v.
	var payload [8]byte
	binary.BigEndian.PutUint16(payload[0:], id.LocalPort)
	binary.BigEndian.PutUint16(payload[2:], id.RemotePort)
	binary.BigEndian.PutUint32(payload[4:], ts)

	// Feed everything to the hasher.
	l.hasherMu.Lock()
	l.hasher.Reset()

	// Per hash.Hash.Writer:
	//
	// It never returns an error.
	l.hasher.Write(payload[:])
	l.hasher.Write(l.nonce[nonceIndex][:])
	l.hasher.Write([]byte(id.LocalAddress))
	l.hasher.Write([]byte(id.RemoteAddress))

	// Finalize the calculation of the hash and return the first 4 bytes.
	h := l.hasher.Sum(nil)
	l.hasherMu.Unlock()

	return binary.BigEndian.Uint32(h[:])
}

// createCookie creates a SYN cookie for the given id and incoming sequence
// number.
func (l *listenContext) createCookie(id stack.TransportEndpointID, seq seqnum.Value, data uint32) seqnum.Value {
	ts := timeStamp(l.stack.Clock())
	v := l.cookieHash(id, 0, 0) + uint32(seq) + (ts << tsOffset)
	v += (l.cookieHash(id, ts, 1) + data) & hashMask
	return seqnum.Value(v)
}

// isCookieValid checks if the supplied cookie is valid for the given id and
// sequence number. If it is, it also returns the data originally encoded in the
// cookie when createCookie was called.
func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnum.Value, seq seqnum.Value) (uint32, bool) {
	ts := timeStamp(l.stack.Clock())
	v := uint32(cookie) - l.cookieHash(id, 0, 0) - uint32(seq)
	cookieTS := v >> tsOffset
	if ((ts - cookieTS) & tsMask) > maxTSDiff {
		return 0, false
	}

	return (v - l.cookieHash(id, cookieTS, 1)) & hashMask, true
}

// createConnectingEndpoint creates a new endpoint in a connecting state, with
// the connection parameters given by the arguments.
func (l *listenContext) createConnectingEndpoint(s *segment, rcvdSynOpts header.TCPSynOptions, queue *waiter.Queue) (*endpoint, tcpip.Error) {
	// Create a new endpoint.
	netProto := l.netProto
	if netProto == 0 {
		netProto = s.netProto
	}

	route, err := l.stack.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */)
	if err != nil {
		return nil, err
	}

	n := newEndpoint(l.stack, l.protocol, netProto, queue)
	n.ops.SetV6Only(l.v6Only)
	n.TransportEndpointInfo.ID = s.id
	n.boundNICID = s.nicID
	n.route = route
	n.effectiveNetProtos = []tcpip.NetworkProtocolNumber{s.netProto}
	n.ops.SetReceiveBufferSize(int64(l.rcvWnd), false /* notify */)
	n.amss = calculateAdvertisedMSS(n.userMSS, n.route)
	n.setEndpointState(StateConnecting)

	n.maybeEnableTimestamp(rcvdSynOpts)
	n.maybeEnableSACKPermitted(rcvdSynOpts)

	n.initGSO()

	// Bootstrap the auto tuning algorithm. Starting at zero will result in
	// a large step function on the first window adjustment causing the
	// window to grow to a really large value.
	n.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = n.initialReceiveWindow()

	return n, nil
}

// startHandshake creates a new endpoint in connecting state and then sends
// the SYN-ACK for the TCP 3-way handshake. It returns the state of the
// handshake in progress, which includes the new endpoint in the SYN-RCVD
// state.
//
// On success, a handshake h is returned with h.ep.mu held.
//
// Precondition: if l.listenEP != nil, l.listenEP.mu must be locked.
func (l *listenContext) startHandshake(s *segment, opts header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*handshake, tcpip.Error) {
	// Create new endpoint.
	irs := s.sequenceNumber
	isn := generateSecureISN(s.id, l.stack.Clock(), l.protocol.seqnumSecret)
	ep, err := l.createConnectingEndpoint(s, opts, queue)
	if err != nil {
		return nil, err
	}

	// Lock the endpoint before registering to ensure that no out of
	// band changes are possible due to incoming packets etc till
	// the endpoint is done initializing.
	ep.mu.Lock()
	ep.owner = owner

	// listenEP is nil when listenContext is used by tcp.Forwarder.
	deferAccept := time.Duration(0)
	if l.listenEP != nil {
		if l.listenEP.EndpointState() != StateListen {

			// Ensure we release any registrations done by the newly
			// created endpoint.
			ep.mu.Unlock()
			ep.Close()

			return nil, &tcpip.ErrConnectionAborted{}
		}
		l.addPendingEndpoint(ep)

		// Propagate any inheritable options from the listening endpoint
		// to the newly created endpoint.
		l.listenEP.propagateInheritableOptionsLocked(ep) // +checklocksforce

		if !ep.reserveTupleLocked() {
			ep.mu.Unlock()
			ep.Close()

			l.removePendingEndpoint(ep)

			return nil, &tcpip.ErrConnectionAborted{}
		}

		deferAccept = l.listenEP.deferAccept
	}

	// Register new endpoint so that packets are routed to it.
	if err := ep.stack.RegisterTransportEndpoint(
		ep.effectiveNetProtos,
		ProtocolNumber,
		ep.TransportEndpointInfo.ID,
		ep,
		ep.boundPortFlags,
		ep.boundBindToDevice,
	); err != nil {
		ep.mu.Unlock()
		ep.Close()

		if l.listenEP != nil {
			l.removePendingEndpoint(ep)
		}

		ep.drainClosingSegmentQueue()

		return nil, err
	}

	ep.isRegistered = true

	// Initialize and start the handshake.
	h := ep.newPassiveHandshake(isn, irs, opts, deferAccept)
	h.listenEP = l.listenEP
	h.start()
	return h, nil
}

// performHandshake performs a TCP 3-way handshake. On success, the new
// established endpoint is returned with e.mu held.
//
// Precondition: if l.listenEP != nil, l.listenEP.mu must be locked.
func (l *listenContext) performHandshake(s *segment, opts header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*endpoint, tcpip.Error) {
	h, err := l.startHandshake(s, opts, queue, owner)
	if err != nil {
		return nil, err
	}
	ep := h.ep

	// N.B. the endpoint is generated above by startHandshake, and will be
	// returned locked. This first call is forced.
	if err := h.complete(); err != nil { // +checklocksforce
		ep.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		ep.stats.FailedConnectionAttempts.Increment()
		l.cleanupFailedHandshake(h)
		return nil, err
	}
	l.cleanupCompletedHandshake(h)
	return ep, nil
}

func (l *listenContext) addPendingEndpoint(n *endpoint) {
	l.pendingMu.Lock()
	l.pendingEndpoints[n] = struct{}{}
	l.pending.Add(1)
	l.pendingMu.Unlock()
}

func (l *listenContext) removePendingEndpoint(n *endpoint) {
	l.pendingMu.Lock()
	delete(l.pendingEndpoints, n)
	l.pending.Done()
	l.pendingMu.Unlock()
}

func (l *listenContext) closeAllPendingEndpoints() {
	l.pendingMu.Lock()
	for n := range l.pendingEndpoints {
		n.notifyProtocolGoroutine(notifyClose)
	}
	l.pendingMu.Unlock()
	l.pending.Wait()
}

// +checklocks:h.ep.mu
func (l *listenContext) cleanupFailedHandshake(h *handshake) {
	e := h.ep
	e.mu.Unlock()
	e.Close()
	e.notifyAborted()
	if l.listenEP != nil {
		l.removePendingEndpoint(e)
	}
	e.drainClosingSegmentQueue()
	e.h = nil
}

// cleanupCompletedHandshake transfers any state from the completed handshake to
// the new endpoint.
//
// +checklocks:h.ep.mu
func (l *listenContext) cleanupCompletedHandshake(h *handshake) {
	e := h.ep
	if l.listenEP != nil {
		l.removePendingEndpoint(e)
	}
	e.isConnectNotified = true

	// Update the receive window scaling. We can't do it before the
	// handshake because it's possible that the peer doesn't support window
	// scaling.
	e.rcv.RcvWndScale = e.h.effectiveRcvWndScale()

	// Clean up handshake state stored in the endpoint so that it can be GCed.
	e.h = nil
}

// propagateInheritableOptionsLocked propagates any options set on the listening
// endpoint to the newly created endpoint.
//
// +checklocks:e.mu
// +checklocks:n.mu
func (e *endpoint) propagateInheritableOptionsLocked(n *endpoint) {
	n.userTimeout = e.userTimeout
	n.portFlags = e.portFlags
	n.boundBindToDevice = e.boundBindToDevice
	n.boundPortFlags = e.boundPortFlags
	n.userMSS = e.userMSS
}

// reserveTupleLocked reserves an accepted endpoint's tuple.
//
// Precondition: e.propagateInheritableOptionsLocked has been called.
//
// +checklocks:e.mu
func (e *endpoint) reserveTupleLocked() bool {
	dest := tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
		Port: e.TransportEndpointInfo.ID.RemotePort,
	}
	portRes := ports.Reservation{
		Networks:     e.effectiveNetProtos,
		Transport:    ProtocolNumber,
		Addr:         e.TransportEndpointInfo.ID.LocalAddress,
		Port:         e.TransportEndpointInfo.ID.LocalPort,
		Flags:        e.boundPortFlags,
		BindToDevice: e.boundBindToDevice,
		Dest:         dest,
	}
	if !e.stack.ReserveTuple(portRes) {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return false
	}

	e.isPortReserved = true
	e.boundDest = dest
	return true
}

// notifyAborted wakes up any waiters on registered, but not accepted
// endpoints.
//
// This is strictly not required normally as a socket that was never accepted
// can't really have any registered waiters except when stack.Wait() is called
// which waits for all registered endpoints to stop and expects an EventHUp.
func (e *endpoint) notifyAborted() {
	e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
}

// handleSynSegment is called in its own goroutine once the listening endpoint
// receives a SYN segment. It is responsible for completing the handshake and
// queueing the new endpoint for acceptance.
//
// A limited number of these goroutines are allowed before TCP starts using SYN
// cookies to accept connections.
//
// +checklocks:e.mu
func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts header.TCPSynOptions) tcpip.Error {
	defer s.decRef()

	h, err := ctx.startHandshake(s, opts, &waiter.Queue{}, e.owner)
	if err != nil {
		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		e.stats.FailedConnectionAttempts.Increment()
		atomic.AddInt32(&e.synRcvdCount, -1)
		return err
	}

	go func() {
		// Note that startHandshake returns a locked endpoint. The
		// force call here just makes it so.
		if err := h.complete(); err != nil { // +checklocksforce
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
			ctx.cleanupFailedHandshake(h)
			atomic.AddInt32(&e.synRcvdCount, -1)
			return
		}
		ctx.cleanupCompletedHandshake(h)
		h.ep.startAcceptedLoop()
		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()

		// Deliver the endpoint to the accept queue.
		e.mu.Lock()
		e.pendingAccepted.Add(1)
		e.mu.Unlock()
		defer e.pendingAccepted.Done()

		// Drop the lock before notifying to avoid deadlock in user-specified
		// callbacks.
		delivered := func() bool {
			e.acceptMu.Lock()
			defer e.acceptMu.Unlock()
			for {
				if e.accepted == (accepted{}) {
					// If the listener has transitioned out of the listen state (accepted
					// is the zero value), the new endpoint is reset instead.
					return false
				}
				if e.accepted.acceptQueueIsFullLocked() {
					e.acceptCond.Wait()
					continue
				}

				e.accepted.endpoints.PushBack(h.ep)
				atomic.AddInt32(&e.synRcvdCount, -1)
				return true
			}
		}()

		if delivered {
			e.waiterQueue.Notify(waiter.ReadableEvents)
		} else {
			h.ep.notifyProtocolGoroutine(notifyReset)
		}
	}()

	return nil
}

func (e *endpoint) synRcvdBacklogFull() bool {
	e.acceptMu.Lock()
	acceptedCap := e.accepted.cap
	e.acceptMu.Unlock()
	// The capacity of the accepted queue would always be one greater than the
	// listen backlog. But, the SYNRCVD connections count is always checked
	// against the listen backlog value for Linux parity reason.
	// https://github.com/torvalds/linux/blob/7acac4b3196/include/net/inet_connection_sock.h#L280
	//
	// We maintain an equality check here as the synRcvdCount is incremented
	// and compared only from a single listener context and the capacity of
	// the accepted queue can only increase by a new listen call.
	return int(atomic.LoadInt32(&e.synRcvdCount)) == acceptedCap-1
}

func (e *endpoint) acceptQueueIsFull() bool {
	e.acceptMu.Lock()
	full := e.accepted.acceptQueueIsFullLocked()
	e.acceptMu.Unlock()
	return full
}

func (a *accepted) acceptQueueIsFullLocked() bool {
	return a.endpoints.Len() == a.cap
}

// handleListenSegment is called when a listening endpoint receives a segment
// and needs to handle it.
//
// +checklocks:e.mu
func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Error {
	e.rcvQueueInfo.rcvQueueMu.Lock()
	rcvClosed := e.rcvQueueInfo.RcvClosed
	e.rcvQueueInfo.rcvQueueMu.Unlock()
	if rcvClosed || s.flags.Contains(header.TCPFlagSyn|header.TCPFlagAck) {
		// If the endpoint is shutdown, reply with reset.
		//
		// RFC 793 section 3.4 page 35 (figure 12) outlines that a RST
		// must be sent in response to a SYN-ACK while in the listen
		// state to prevent completing a handshake from an old SYN.
		return replyWithReset(e.stack, s, e.sendTOS, e.ttl)
	}

	switch {
	case s.flags.Contains(header.TCPFlagRst):
		e.stack.Stats().DroppedPackets.Increment()
		return nil

	case s.flags == header.TCPFlagSyn:
		if e.acceptQueueIsFull() {
			e.stack.Stats().TCP.ListenOverflowSynDrop.Increment()
			e.stats.ReceiveErrors.ListenOverflowSynDrop.Increment()
			e.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		alwaysUseSynCookies := func() bool {
			var alwaysUseSynCookies tcpip.TCPAlwaysUseSynCookies
			if err := e.stack.TransportProtocolOption(header.TCPProtocolNumber, &alwaysUseSynCookies); err != nil {
				panic(fmt.Sprintf("TransportProtocolOption(%d, %T) = %s", header.TCPProtocolNumber, alwaysUseSynCookies, err))
			}
			return bool(alwaysUseSynCookies)
		}()

		opts := parseSynSegmentOptions(s)
		if !alwaysUseSynCookies && !e.synRcvdBacklogFull() {
			s.incRef()
			atomic.AddInt32(&e.synRcvdCount, 1)
			return e.handleSynSegment(ctx, s, opts)
		}
		route, err := e.stack.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */)
		if err != nil {
			return err
		}
		defer route.Release()

		// Send SYN without window scaling because we currently
		// don't encode this information in the cookie.
		//
		// Enable Timestamp option if the original syn did have
		// the timestamp option specified.
		//
		// Use the user supplied MSS on the listening socket for
		// new connections, if available.
		synOpts := header.TCPSynOptions{
			WS:    -1,
			TS:    opts.TS,
			TSEcr: opts.TSVal,
			MSS:   calculateAdvertisedMSS(e.userMSS, route),
		}
		if opts.TS {
			offset := e.protocol.tsOffset(s.dstAddr, s.srcAddr)
			now := e.stack.Clock().NowMonotonic()
			synOpts.TSVal = offset.TSVal(now)
		}
		cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS))
		fields := tcpFields{
			id:     s.id,
			ttl:    e.ttl,
			tos:    e.sendTOS,
			flags:  header.TCPFlagSyn | header.TCPFlagAck,
			seq:    cookie,
			ack:    s.sequenceNumber + 1,
			rcvWnd: ctx.rcvWnd,
		}
		if err := e.sendSynTCP(route, fields, synOpts); err != nil {
			return err
		}
		e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment()
		return nil

	case s.flags.Contains(header.TCPFlagAck):
		// Keep hold of acceptMu until the new endpoint is in the accept queue (or
		// if there is an error), to guarantee that we will keep our spot in the
		// queue even if another handshake from the syn queue completes.
		e.acceptMu.Lock()
		if e.accepted.acceptQueueIsFullLocked() {
			// Silently drop the ack as the application can't accept
			// the connection at this point. The ack will be
			// retransmitted by the sender anyway and we can
			// complete the connection at the time of retransmit if
			// the backlog has space.
			e.acceptMu.Unlock()
			e.stack.Stats().TCP.ListenOverflowAckDrop.Increment()
			e.stats.ReceiveErrors.ListenOverflowAckDrop.Increment()
			e.stack.Stats().DroppedPackets.Increment()
			return nil
		}

		iss := s.ackNumber - 1
		irs := s.sequenceNumber - 1

		// Since SYN cookies are in use this is potentially an ACK to a
		// SYN-ACK we sent but don't have a half open connection state
		// as cookies are being used to protect against a potential SYN
		// flood. In such cases validate the cookie and if valid create
		// a fully connected endpoint and deliver to the accept queue.
		//
		// If not, silently drop the ACK to avoid leaking information
		// when under a potential syn flood attack.
		//
		// Validate the cookie.
		data, ok := ctx.isCookieValid(s.id, iss, irs)
		if !ok || int(data) >= len(mssTable) {
			e.acceptMu.Unlock()
			e.stack.Stats().TCP.ListenOverflowInvalidSynCookieRcvd.Increment()
			e.stack.Stats().DroppedPackets.Increment()

			// When not using SYN cookies, as per RFC 793, section 3.9, page 64:
			// Any acknowledgment is bad if it arrives on a connection still in
			// the LISTEN state.  An acceptable reset segment should be formed
			// for any arriving ACK-bearing segment.  The RST should be
			// formatted as follows:
			//
			//  <SEQ=SEG.ACK><CTL=RST>
			//
			// Send a reset as this is an ACK for which there is no
			// half open connections and we are not using cookies
			// yet.
			//
			// The only time we should reach here when a connection
			// was opened and closed really quickly and a delayed
			// ACK was received from the sender.
			return replyWithReset(e.stack, s, e.sendTOS, e.ttl)
		}
		e.stack.Stats().TCP.ListenOverflowSynCookieRcvd.Increment()
		// Create newly accepted endpoint and deliver it.
		rcvdSynOptions := header.TCPSynOptions{
			MSS: mssTable[data],
			// Disable Window scaling as original SYN is
			// lost.
			WS: -1,
		}

		// When syn cookies are in use we enable timestamp only
		// if the ack specifies the timestamp option assuming
		// that the other end did in fact negotiate the
		// timestamp option in the original SYN.
		if s.parsedOptions.TS {
			rcvdSynOptions.TS = true
			rcvdSynOptions.TSVal = s.parsedOptions.TSVal
			rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr
		}

		n, err := ctx.createConnectingEndpoint(s, rcvdSynOptions, &waiter.Queue{})
		if err != nil {
			e.acceptMu.Unlock()
			return err
		}

		n.mu.Lock()

		// Propagate any inheritable options from the listening endpoint
		// to the newly created endpoint.
		e.propagateInheritableOptionsLocked(n)

		if !n.reserveTupleLocked() {
			n.mu.Unlock()
			e.acceptMu.Unlock()
			n.Close()

			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
			return nil
		}

		// Register new endpoint so that packets are routed to it.
		if err := n.stack.RegisterTransportEndpoint(
			n.effectiveNetProtos,
			ProtocolNumber,
			n.TransportEndpointInfo.ID,
			n,
			n.boundPortFlags,
			n.boundBindToDevice,
		); err != nil {
			n.mu.Unlock()
			e.acceptMu.Unlock()
			n.Close()

			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
			return err
		}

		n.isRegistered = true
		n.TSOffset = n.protocol.tsOffset(s.dstAddr, s.srcAddr)

		// Switch state to connected.
		n.isConnectNotified = true
		h := handshake{
			ep:                  n,
			iss:                 iss,
			ackNum:              irs + 1,
			rcvWnd:              seqnum.Size(n.initialReceiveWindow()),
			sndWnd:              s.window,
			rcvWndScale:         e.rcvWndScaleForHandshake(),
			sndWndScale:         rcvdSynOptions.WS,
			mss:                 rcvdSynOptions.MSS,
			sampleRTTWithTSOnly: true,
		}
		h.transitionToStateEstablishedLocked(s)

		// Requeue the segment if the ACK completing the handshake has more info
		// to be procesed by the newly established endpoint.
		if (s.flags.Contains(header.TCPFlagFin) || s.data.Size() > 0) && n.enqueueSegment(s) {
			s.incRef()
			n.newSegmentWaker.Assert()
		}

		// Start the protocol goroutine.
		n.startAcceptedLoop()
		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()

		// Deliver the endpoint to the accept queue.
		e.accepted.endpoints.PushBack(n)
		e.acceptMu.Unlock()

		e.waiterQueue.Notify(waiter.ReadableEvents)
		return nil

	default:
		e.stack.Stats().DroppedPackets.Increment()
		return nil
	}
}

// protocolListenLoop is the main loop of a listening TCP endpoint. It runs in
// its own goroutine and is responsible for handling connection requests.
func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) {
	e.mu.Lock()
	v6Only := e.ops.GetV6Only()
	ctx := newListenContext(e.stack, e.protocol, e, rcvWnd, v6Only, e.NetProto)

	defer func() {
		// Mark endpoint as closed. This will prevent goroutines running
		// handleSynSegment() from attempting to queue new connections
		// to the endpoint.
		e.setEndpointState(StateClose)

		// Close any endpoints in SYN-RCVD state.
		ctx.closeAllPendingEndpoints()

		// Do cleanup if needed.
		e.completeWorkerLocked()

		if e.drainDone != nil {
			close(e.drainDone)
		}
		e.mu.Unlock()

		e.drainClosingSegmentQueue()

		// Notify waiters that the endpoint is shutdown.
		e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
	}()

	var s sleep.Sleeper
	s.AddWaker(&e.notificationWaker, wakerForNotification)
	s.AddWaker(&e.newSegmentWaker, wakerForNewSegment)
	for {
		e.mu.Unlock()
		index, _ := s.Fetch(true)
		e.mu.Lock()
		switch index {
		case wakerForNotification:
			n := e.fetchNotifications()
			if n&notifyClose != 0 {
				return
			}
			if n&notifyDrain != 0 {
				for !e.segmentQueue.empty() {
					s := e.segmentQueue.dequeue()
					// TODO(gvisor.dev/issue/4690): Better handle errors instead of
					// silently dropping.
					_ = e.handleListenSegment(ctx, s)
					s.decRef()
				}
				close(e.drainDone)
				e.mu.Unlock()
				<-e.undrain
				e.mu.Lock()
			}

		case wakerForNewSegment:
			// Process at most maxSegmentsPerWake segments.
			mayRequeue := true
			for i := 0; i < maxSegmentsPerWake; i++ {
				s := e.segmentQueue.dequeue()
				if s == nil {
					mayRequeue = false
					break
				}

				// TODO(gvisor.dev/issue/4690): Better handle errors instead of
				// silently dropping.
				_ = e.handleListenSegment(ctx, s)
				s.decRef()
			}

			// If the queue is not empty, make sure we'll wake up
			// in the next iteration.
			if mayRequeue && !e.segmentQueue.empty() {
				e.newSegmentWaker.Assert()
			}
		}
	}
}